Plot 1: The Night Owl’s Diet

# Build category-level sales by time of day and weekend flag.
# Result: one row per product_category x time_period x is_weekend with the
# summed sales, the row count, and the category's percentage share of all
# sales in that time_period x is_weekend cell.
# NOTE(review): no column from `demographics` is referenced later in this
# pipeline; the join is kept as-is.
night_data <- transactions_sample %>%
  left_join(products, by = "product_id") %>%
  left_join(demographics, by = "household_id") %>%
  filter(!is.na(product_category)) %>%
  mutate(
    # "Day" covers hours 8 through 19; everything else is "Night"
    txn_hour = hour(transaction_timestamp),
    time_period = if_else(txn_hour < 8 | txn_hour >= 20, "Night", "Day"),
    # wday() values 1 and 7 are treated as the weekend
    is_weekend = wday(transaction_timestamp) %in% c(7, 1)
  ) %>%
  group_by(product_category, time_period, is_weekend) %>%
  summarise(
    total_sales = sum(sales_value),
    n_transactions = n(),
    .groups = "drop"
  ) %>%
  # Shares are computed within each time_period x is_weekend cell
  group_by(time_period, is_weekend) %>%
  mutate(
    share = total_sales / sum(total_sales) * 100,
    period_total = sum(total_sales)
  ) %>%
  ungroup()

# Reshape to one row per category x is_weekend with Day/Night columns side by
# side, compute the night-minus-day share gap, and keep the 8 busiest
# categories (by average row count) within each is_weekend group.
plot_data <- night_data %>%
  select(product_category, time_period, is_weekend, share, n_transactions) %>%
  pivot_wider(
    names_from = time_period,
    values_from = c(share, n_transactions)
  ) %>%
  # Only categories observed in both periods can be compared
  filter(!is.na(share_Day), !is.na(share_Night)) %>%
  mutate(
    difference = share_Night - share_Day,
    avg_transactions = (n_transactions_Day + n_transactions_Night) / 2
  ) %>%
  group_by(is_weekend) %>%
  slice_max(order_by = avg_transactions, n = 8) %>%
  ungroup() %>%
  arrange(difference)

# Create improved plot
# Diverging bar chart of the night-minus-day share difference, with one panel
# for weekdays and one for weekends.
ggplot(
  plot_data,
  aes(
    x = difference,
    # A category can appear in both panels, so the y key is made unique per
    # panel. This sorts bars by difference within each facet rather than by
    # the category's mean difference across facets. The "___<flag>" suffix is
    # stripped again in scale_y_discrete() below.
    y = reorder(paste(product_category, is_weekend, sep = "___"), difference),
    fill = difference > 0
  )
) +
  geom_col(alpha = 0.8) +
  geom_vline(xintercept = 0, color = "grey50", linetype = "solid") +
  geom_text(
    aes(
      label = sprintf("%+.1f%%", difference),
      # Mapped inside aes() so the justification always follows the row being
      # drawn: labels sit just outside the end of each bar
      hjust = if_else(difference > 0, -0.1, 1.1)
    ),
    size = 3
  ) +
  facet_wrap(~if_else(is_weekend, "Weekend", "Weekday"), ncol = 1, scales = "free_y") +
  # Named values pin red to negative and green to positive even when only one
  # sign is present in the data
  scale_fill_manual(
    values = c("FALSE" = "#d62728", "TRUE" = "#2ca02c"),
    guide = "none"
  ) +
  scale_x_continuous(labels = function(x) paste0(sprintf("%+.1f", x), "%")) +
  # Show only the category name on the axis (drop the per-panel key suffix)
  scale_y_discrete(labels = function(key) sub("___.*$", "", key)) +
  labs(
    title = "The Night Owl's Diet: Product Category Shifts from Day to Night",
    subtitle = "Change in share of total sales: Night (8 PM - 8 AM) minus Day (8 AM - 8 PM)\nTop 8 categories by transaction volume shown",
    x = "Change in Sales Share (Night - Day)",
    y = NULL,
    caption = "Source: Complete Journey sample data | Positive values = higher night sales\nBAG SNACKS show strongest night preference (+1.2% weekends); BEEF shifts to daytime"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5, color = "grey40", margin = margin(b = 15)),
    plot.caption = element_text(color = "grey50", size = 9, hjust = 0),
    panel.grid.major.y = element_blank(),
    strip.text = element_text(face = "bold")
  )

Plot 2: The Private Label Conquest

# Improved private label analysis with clear definitions
# Result: one ungrouped row per department (private-label rows only) with
#   total_sales       - private-label sales in the department
#   unique_households - households with at least one private-label row
#   repeat_buyers     - households with 2+ private-label rows
#   loyalty_rate      - repeat_buyers / unique_households, in percent (0-100)
#   department_sales  - sales across all label types in the department
#   private_share     - total_sales / department_sales, in percent
private_data <- transactions_sample %>%
  left_join(products, by = "product_id") %>%
  filter(!is.na(department)) %>%
  mutate(
    is_private_label = str_detect(tolower(brand), "private|store|kroger|target|walmart")
  ) %>%
  # Collapse to one row per household within department x label first, so
  # that repeat buyers are counted as households rather than as transaction rows
  group_by(department, is_private_label, household_id) %>%
  summarize(
    household_sales = sum(sales_value),
    transaction_count = n(),
    .groups = "drop"
  ) %>%
  # Loyalty: % of households that bought in the department 2+ times
  group_by(department, is_private_label) %>%
  summarize(
    total_sales = sum(household_sales),
    unique_households = n(),
    repeat_buyers = sum(transaction_count > 1),
    .groups = "drop"
  ) %>%
  mutate(
    loyalty_rate = repeat_buyers / unique_households * 100
  ) %>%
  group_by(department) %>%
  mutate(
    department_sales = sum(total_sales),
    private_share = ifelse(is_private_label, total_sales / department_sales * 100, NA)
  ) %>%
  # Drop the grouping before filtering and relabelling so the result is a
  # plain data frame and the grouping column itself is free to be rewritten
  ungroup() %>%
  filter(is_private_label == TRUE & !is.na(private_share)) %>%
  filter(department_sales > 10000) %>%  # Meaningful departments only
  mutate(
    department = str_to_title(department),
    department = str_replace_all(department, "-", " ")
  )

# Medians of the two plotted metrics, used to position the dashed reference
# lines and their labels
median_share <- private_data %>%
  pull(private_share) %>%
  median(na.rm = TRUE)
median_loyalty <- private_data %>%
  pull(loyalty_rate) %>%
  median(na.rm = TRUE)

# Bubble chart: private-label share (x) against loyalty rate (y), one bubble
# per department, with bubble area mapped to department sales
ggplot(
  data = private_data,
  mapping = aes(x = private_share, y = loyalty_rate, size = department_sales)
) +
  geom_point(alpha = 0.7, color = my_colors[3]) +
  geom_text_repel(
    mapping = aes(label = department),
    size = 3.2,
    box.padding = 0.4,
    point.padding = 0.2,
    max.overlaps = 20
  ) +
  # Dashed median lines split the panel into four quadrants
  geom_hline(
    yintercept = median_loyalty,
    color = "grey50", linetype = "dashed", alpha = 0.8
  ) +
  geom_vline(
    xintercept = median_share,
    color = "grey50", linetype = "dashed", alpha = 0.8
  ) +
  # Labels for the two median lines, placed near the upper ends of the data range
  annotate(
    "text",
    x = median_share,
    y = max(private_data$loyalty_rate) * 0.95,
    label = paste("Median Share =", round(median_share, 1), "%"),
    color = "grey40", size = 3, hjust = -0.1
  ) +
  annotate(
    "text",
    x = max(private_data$private_share) * 0.95,
    y = median_loyalty,
    label = paste("Median Loyalty =", round(median_loyalty, 1), "%"),
    color = "grey40", size = 3, vjust = -0.5, angle = 90
  ) +
  scale_size_area(
    name = "Department Sales",
    breaks = c(50000, 100000, 200000),
    labels = dollar_format(scale = 1e-3, suffix = "K"),
    max_size = 15
  ) +
  # Both axes start at zero and are labelled as percentages
  scale_x_continuous(
    limits = c(0, NA),
    labels = function(value) paste0(round(value, digits = 1), "%")
  ) +
  scale_y_continuous(
    limits = c(0, NA),
    labels = function(value) paste0(round(value, digits = 1), "%")
  ) +
  labs(
    x = "Private Label Share of Department Sales (%)",
    y = "Customer Loyalty Rate (%)",
    title = "The Private Label Conquest: Market Share vs Customer Loyalty",
    subtitle = "Private Label Share = % of department sales from store brands | Loyalty Rate = % of households with 2+ purchases\nBubble size represents total department sales",
    caption = "Source: Complete Journey sample data | Fuel shows high share but low loyalty; Pastry excels in both metrics"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5, color = "grey40", margin = margin(b = 15)),
    plot.caption = element_text(color = "grey50", size = 9, hjust = 0)
  )

Plot 3: The First-Basket Prophecy

# Improved first basket analysis with better documentation
# Step 1: per household, total the rows sharing the earliest timestamp (the
#         "first basket"), total all rows (the "lifetime value"), and count
#         distinct shopping trips.
# Step 2: keep households with positive spend and at least two trips.
# Step 3: bin households into quintiles of first-basket value and summarise
#         lifetime value per quintile.
# Result: one row per quintile with median_lifetime_value, n_households, se.
first_basket_improved <- transactions_sample %>%
  group_by(household_id) %>%
  summarize(
    first_basket_value = sum(
      sales_value[transaction_timestamp == min(transaction_timestamp)]
    ),
    lifetime_value = sum(sales_value),
    # Count distinct timestamps rather than rows: several rows can share one
    # timestamp (they are summed together as the first basket above), so a
    # raw row count would let a single multi-item basket pass the filter below
    n_transactions = n_distinct(transaction_timestamp),
    .groups = "drop"
  ) %>%
  filter(
    first_basket_value > 0,
    lifetime_value > 0,
    n_transactions >= 2  # At least 2 transactions to have "lifetime" behavior
  ) %>%
  mutate(
    # NOTE(review): cut() errors if the quantile breaks are not unique (heavy
    # ties in first_basket_value) - confirm this cannot happen with this data
    first_basket_quintile = cut(
      first_basket_value,
      breaks = quantile(first_basket_value, probs = seq(0, 1, 0.2), na.rm = TRUE),
      labels = c("Q1 (Smallest\nFirst Basket)", "Q2", "Q3", "Q4", "Q5 (Largest\nFirst Basket)"),
      include.lowest = TRUE
    )
  ) %>%
  group_by(first_basket_quintile) %>%
  summarize(
    median_lifetime_value = median(lifetime_value),
    n_households = n(),
    # NOTE(review): this is the standard error of the MEAN; any interval
    # drawn around the median with it is only a rough guide
    se = sd(lifetime_value) / sqrt(n()),
    .groups = "drop"
  )

# Ratio of the 5th row's median lifetime value to the 1st row's, rounded to
# one decimal place (rows are positional: row 1 and row 5 of the summary)
multiplier <- with(
  first_basket_improved,
  round(median_lifetime_value[5] / median_lifetime_value[1], digits = 1)
)

# Dot plot of median lifetime value per first-basket quintile, with
# horizontal error bars and value / sample-size labels beside each dot
ggplot(
  data = first_basket_improved,
  mapping = aes(
    x = median_lifetime_value,
    y = fct_rev(first_basket_quintile)  # reverse the factor order on the y axis
  )
) +
  geom_point(color = my_colors[1], size = 6) +
  geom_errorbarh(
    mapping = aes(
      xmin = median_lifetime_value - 1.96 * se,
      xmax = median_lifetime_value + 1.96 * se
    ),
    color = "grey50",
    height = 0.2
  ) +
  # Dollar value to the right of each dot
  geom_text(
    mapping = aes(label = dollar(median_lifetime_value, accuracy = 1)),
    fontface = "bold", size = 3.5, hjust = -0.3
  ) +
  # Household count, offset vertically from the dollar label
  geom_text(
    mapping = aes(label = paste("n =", n_households)),
    color = "grey50", size = 3, hjust = -0.3, vjust = 2
  ) +
  # Extra room on the right-hand side so the labels are not clipped
  scale_x_continuous(
    expand = expansion(mult = c(0, 0.2)),
    labels = dollar
  ) +
  labs(
    x = "Median Customer Lifetime Value ($)",
    y = "First Basket Size Quintile",
    title = "The First-Basket Prophecy: Initial Purchase Predicts Lifetime Value",
    subtitle = "Households grouped by quintiles of first shopping basket value | Lifetime value = total spend across all transactions",
    caption = paste("Source: Complete Journey sample data | Q5 households spend", multiplier, "times more than Q1\nIncludes only households with 2+ transactions; error bars show 95% confidence intervals")
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    axis.text.y = element_text(size = 10, lineheight = 1.1),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5, color = "grey40", margin = margin(b = 15)),
    plot.caption = element_text(color = "grey50", size = 9, hjust = 0)
  )