Module 5 Visualization Lab

# Load all required libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(forcats)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(lubridate)
library(ggalluvial)
library(tidytext)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

library(grid)
library(stringr)
library(ggrepel)
library(ggnewscale)
library(patchwork)

# Load datasets
transactions <- get_transactions()
products     <- products
demographics <- demographics
coupons      <- coupons
campaigns    <- campaigns
coupon_redemptions <- coupon_redemptions

# ---- Normalize products columns ----
cat_candidates <- intersect(c("category", "product_category", "commodity", "subcategory"), names(products))
if (length(cat_candidates) == 0) {
  stop("No category-like column found in `products`. Available columns: ", paste(names(products), collapse = ", "))
}
cat_col <- cat_candidates[1]

products_dim <- products %>%
  setNames(replace(names(.), names(.) == cat_col, "category")) %>%
  dplyr::select(product_id,
                dplyr::any_of(c("department")),   # department exists in all CRAN versions
                dplyr::any_of(c("brand")),
                category)

knitr::opts_chunk$set(fig.width = 16, fig.height = 10, dpi = 200)

# SEASONALITY: monthly spend per department 

# 1) Build monthly spend and drop January
monthly_dept <- transactions %>%
  mutate(
    date  = as.Date(transaction_timestamp),
    month = lubridate::floor_date(date, "month"),
    sales = coalesce(sales_value, 0),
    mon_num = lubridate::month(month)
  ) %>%
  filter(mon_num != 1) %>%  # <-- remove January
  inner_join(products_dim %>% select(product_id, department), by = "product_id") %>%
  filter(!is.na(department), !is.na(month)) %>%
  group_by(department, month) %>%
  summarise(sales = sum(sales), .groups = "drop") %>%
  mutate(
    mon = lubridate::month(month, label = TRUE, abbr = TRUE),
    # Ensure months are ordered Feb..Dec after removing Jan
    mon = factor(as.character(mon), levels = month.abb[2:12])
  )

# 2) Choose departments to display
top_depts <- monthly_dept %>%
  group_by(department) %>% summarise(total_sales = sum(sales), .groups = "drop") %>%
  slice_max(total_sales, n = 4, with_ties = FALSE) %>%
  pull(department)

heat <- monthly_dept %>%
  filter(department %in% top_depts) %>%
  mutate(department = forcats::fct_reorder(department, -sales, .fun = sum))

# 3) Fixed brand colors (edit as desired)
dept_color_map <- c(
  "GROCERY"   = "#1f77b4",  # blue
  "GROCERIES" = "#1f77b4",
  "DRUG GM"   = "#2ca02c",  # green
  "DRUG"      = "#2ca02c",
  "PRODUCE"   = "#ff7f0e",  # orange
  "DELI"      = "#9467bd"   # purple
)
fallback_pairs <- list(
  c("#f1eef6", "#756bb1"),
  c("#fee0d2", "#e6550d"),
  c("#e5f5e0", "#31a354"),
  c("#deebf7", "#3182bd")
)
get_low_high <- function(dname, idx) {
  key <- toupper(dname)
  hi <- dept_color_map[key]
  if (is.na(hi)) {
    pair <- fallback_pairs[[ ((idx - 1) %% length(fallback_pairs)) + 1 ]]
    list(low = pair[1], high = pair[2])
  } else {
    list(low = "#ffffff", high = unname(hi))  # white -> brand color
  }
}

# 4) One panel per department (legend next to its own row), sqrt for stronger contrast
make_dept_panel <- function(dname, idx) {
  d <- dplyr::filter(heat, department == dname)
  cols <- get_low_high(dname, idx)
  ggplot(d, aes(x = mon, y = department, fill = sales)) +
    geom_tile(width = 0.95, height = 0.9) +
    scale_fill_gradient(
      low = cols$low, high = cols$high,
      labels = scales::dollar, name = paste0(dname, " monthly spend"),
      trans = "sqrt", oob = scales::squish
    ) +
    labs(x = "Month", y = NULL) +
    theme_minimal(base_size = 16) +
    theme(
      panel.grid = element_blank(),
      axis.title.y = element_blank(),
      axis.text.y  = element_text(face = "bold"),
      legend.position = "right",
      legend.title = element_text(size = 12),
      legend.text  = element_text(size = 11),
      plot.margin  = margin(t = 2, r = 10, b = 2, l = 2)
    )
}

dept_levels <- levels(forcats::fct_drop(heat$department))
panels <- Map(make_dept_panel, dept_levels, seq_along(dept_levels))

library(patchwork)
wrap_plots(panels, ncol = 1, guides = "keep") +
  plot_annotation(
    title = "Seasonality by Department: Actual Monthly Spend (January removed)",
    subtitle = "Each row is a department with its own color ramp and legend. January is excluded to prevent zero-spend from compressing the scale.",
    caption = "Data: completejourney | Joins: transactions + products | Metric: sum of sales_value per department per month (Feb–Dec)",
    theme = theme(
      plot.title = element_text(face = "bold", size = 24),
      plot.subtitle = element_text(size = 14, margin = margin(b = 10)),
      plot.caption = element_text(size = 11)
    )
  )

# WEEKDAY VS WEEKEND: Basket value by household size (with legend and labels)

# Build basket-level spend and weekend flag
baskets <- transactions %>%
  mutate(
    sales = coalesce(sales_value, 0),
    ts    = suppressWarnings(lubridate::ymd_hms(transaction_timestamp))
  ) %>%
  group_by(basket_id, household_id) %>%
  summarise(
    basket_sales = sum(sales),
    first_ts     = suppressWarnings(min(ts)),
    .groups      = "drop"
  ) %>%
  filter(!is.na(first_ts)) %>%
  mutate(
    wday_abbr = as.character(lubridate::wday(first_ts, label = TRUE, abbr = TRUE)),
    day_type  = if_else(wday_abbr %in% c("Sat","Sun"), "Weekend", "Weekday")
  ) %>%
  inner_join(demographics %>% select(household_id, household_size), by = "household_id") %>%
  filter(!is.na(household_size))

# Average basket by household_size x day_type
avg_by_day <- baskets %>%
  group_by(household_size, day_type) %>%
  summarise(avg_basket = mean(basket_sales, na.rm = TRUE), .groups = "drop") %>%
  mutate(household_size = factor(household_size, ordered = TRUE))

# Wide and gap (for ordering), then long for explicit endpoint colors/legend
avg_wide <- avg_by_day %>%
  tidyr::pivot_wider(names_from = day_type, values_from = avg_basket) %>%
  mutate(gap = Weekend - Weekday)

avg_long <- avg_wide %>%
  select(household_size, Weekday, Weekend) %>%
  tidyr::pivot_longer(cols = c("Weekday","Weekend"),
                      names_to = "End", values_to = "Value")

# Dumbbell: segment per row, colored endpoints with legend
max_x <- max(avg_long$Value, na.rm = TRUE)

ggplot() +
  geom_segment(
    data = avg_wide,
    aes(x = Weekday, xend = Weekend, y = household_size, yend = household_size),
    linewidth = 3, color = "grey85", na.rm = TRUE
  ) +
  # endpoints
  geom_point(
    data = avg_long,
    aes(x = Value, y = household_size, color = End),
    size = 4, na.rm = TRUE
  ) +
  # gap label on the right
  geom_text(
    data = avg_wide,
    aes(x = pmax(Weekday, Weekend, na.rm = TRUE),
        y = household_size,
        label = ifelse(is.finite(gap), scales::dollar(gap), "")),
    nudge_x = 0.02 * ifelse(is.finite(max_x), max_x, 0),
    size = 4, color = "grey20", na.rm = TRUE
  ) +
  scale_color_manual(values = c(Weekday = "#1f77b4", Weekend = "#ff7f0e"),
                     name = "Endpoint") +
  scale_x_continuous(labels = scales::dollar) +
  labs(
    title = "Weekend Baskets Are Larger For Most Household Sizes",
    subtitle = "Dumbbell shows average basket value (USD) by household size on Weekdays (blue, left) and Weekends (orange, right). Right-side labels show Weekend minus Weekday.",
    x = "Average Basket Value (USD)", y = "Household Size",
    caption = "Data: completejourney | Joins: transactions + demographics | Basket = sum of line item sales_value per basket_id"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    panel.grid.major.y = element_blank(),
    plot.title = element_text(face = "bold", size = 24),
    plot.subtitle = element_text(size = 14, margin = margin(b = 10)),
    plot.caption = element_text(size = 11),
    legend.position = "top",
    legend.title = element_text(size = 12),
    legend.text  = element_text(size = 11)
  )

# ---- Top-5 Categories by Income (excluding coupons/misc) ----

excluded_departments <- c(
  "COUPON/MISC ITEMS", "COUPONS", "MISC", "MISCELLANEOUS", "UNKNOWN"
)
excluded_category_regex <- stringr::regex(
  "\\bcoupon\\b|\\bmisc\\b|miscellaneous|unknown", ignore_case = TRUE
)

df_income_topcat <- transactions %>%
  mutate(sales = coalesce(sales_value, 0)) %>%
  inner_join(products_dim %>% select(product_id, category, department), by = "product_id") %>%
  inner_join(demographics %>% select(household_id, income), by = "household_id") %>%
  filter(
    !is.na(income), !is.na(category),
    !(toupper(department) %in% toupper(excluded_departments)),
    !stringr::str_detect(category, excluded_category_regex)
  ) %>%
  group_by(income, category) %>%
  summarise(total_sales = sum(sales), .groups = "drop") %>%
  filter(total_sales > 0) %>%
  group_by(income) %>%
  slice_max(total_sales, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    category_wrapped = stringr::str_wrap(category, 18),
    cat_within = tidytext::reorder_within(category_wrapped, total_sales, income),
    sales_lab = scales::dollar(total_sales)
  ) %>%
  group_by(income) %>%
  mutate(
    grp_max      = max(total_sales, na.rm = TRUE),
    place_inside = total_sales >= 0.90 * grp_max,    # label inside long bars
    hjust_lab    = ifelse(place_inside, 1.02, -0.05),
    color_lab    = ifelse(place_inside, "white", "grey20")
  ) %>%
  ungroup()

ggplot(df_income_topcat, aes(cat_within, total_sales)) +
  geom_col(fill = "#2c7fb8", width = 0.7) +
  ggrepel::geom_text_repel(
    aes(label = sales_lab, hjust = hjust_lab, color = color_lab),
    size = 3.3,
    direction = "y",
    min.segment.length = 0, segment.color = NA,
    force = 1, box.padding = 0.15, point.padding = 0.15, max.overlaps = Inf
  ) +
  facet_wrap(~ income, scales = "free_y", ncol = 3) +
  coord_flip(clip = "off") +
  tidytext::scale_x_reordered() +
  scale_y_continuous(labels = scales::dollar, expand = expansion(mult = c(0.02, 0.30))) +
  scale_color_identity() +
  labs(
    title = "What Each Income Group Actually Buys (Coupons/Misc Removed)",
    subtitle = paste0(
      "Each panel shows the top five product categories by total spend within that income band. ",
      "Coupon and miscellaneous items are excluded to focus on real product demand."
    ),
    x = NULL, y = "Total Spend (USD)",
    caption = "Source: completejourney | Built from transactions + products + demographics | Display: top 5 per income"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 12, margin = margin(b = 6)),
    strip.text = element_text(face = "bold"),
    plot.caption = element_text(color = "grey40"),
    panel.spacing = unit(10, "pt")
  )

Module 5 Visualization Lab

Tynan Sander