Load Packages and Data

library(completejourney)
library(tidyverse)

data("transactions_sample")
data("products")
data("demographics")
data("campaigns")

transactions <- transactions_sample
products <- products
demographics <- demographics
campaigns <- campaigns

head(transactions)
## # A tibble: 6 × 11
##   household_id store_id basket_id   product_id quantity sales_value retail_disc
##   <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
## 1 2261         309      31625220889 940996            1        3.86        0.43
## 2 2131         368      32053127496 873902            1        1.59        0.9 
## 3 511          316      32445856036 847901            1        1           0.69
## 4 400          388      31932241118 13094913          2       11.9         2.9 
## 5 918          340      32074655895 1085604           1        1.29        0   
## 6 718          324      32614612029 883203            1        2.5         0.49
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
head(products)
## # A tibble: 6 × 7
##   product_id manufacturer_id department    brand   product_category product_type
##   <chr>      <chr>           <chr>         <fct>   <chr>            <chr>       
## 1 25671      2               GROCERY       Nation… FRZN ICE         ICE - CRUSH…
## 2 26081      2               MISCELLANEOUS Nation… <NA>             <NA>        
## 3 26093      69              PASTRY        Private BREAD            BREAD:ITALI…
## 4 26190      69              GROCERY       Private FRUIT - SHELF S… APPLE SAUCE 
## 5 26355      69              GROCERY       Private COOKIES/CONES    SPECIALTY C…
## 6 26426      69              GROCERY       Private SPICES & EXTRAC… SPICES & SE…
## # ℹ 1 more variable: package_size <chr>
head(demographics)
## # A tibble: 6 × 8
##   household_id age   income    home_ownership marital_status household_size
##   <chr>        <ord> <ord>     <ord>          <ord>          <ord>         
## 1 1            65+   35-49K    Homeowner      Married        2             
## 2 1001         45-54 50-74K    Homeowner      Unmarried      1             
## 3 1003         35-44 25-34K    <NA>           Unmarried      1             
## 4 1004         25-34 15-24K    <NA>           Unmarried      1             
## 5 101          45-54 Under 15K Homeowner      Married        4             
## 6 1012         35-44 35-49K    <NA>           Married        5+            
## # ℹ 2 more variables: household_comp <ord>, kids_count <ord>
head(campaigns)
## # A tibble: 6 × 2
##   campaign_id household_id
##   <chr>       <chr>       
## 1 1           105         
## 2 1           1238        
## 3 1           1258        
## 4 1           1483        
## 5 1           2200        
## 6 1           293

Plot 1: Top Product Categories by Sales in Each Income Bracket

transactions_joined <- transactions %>%
  left_join(products, by = "product_id") %>%
  left_join(demographics, by = "household_id") %>%
  filter(!is.na(income) & !is.na(product_category)) 



top_categories <- transactions_joined %>%
  group_by(income, product_category) %>%  
  summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  group_by(income) %>%
  slice_max(total_sales, n = 3, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(product_category = fct_reorder(product_category, total_sales))  
ggplot(top_categories, aes(x = total_sales, y = product_category)) +  
  geom_col(fill = "steelblue", width = 0.8) +
  facet_wrap(~income, scales = "free_y", ncol = 3) +
  labs(
    title = "Top-Selling Product Categories by Income Bracket",
    subtitle = "The three product categories with the highest total sales in each income group.",
    x = "Total Sales Value ($)",
    y = NULL,
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    strip.text = element_text(size = 10, face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10),
    panel.spacing = unit(1.5, "lines")
  )


Plot 2: Average Sales per Household Member by Household Size

transactions_demo <- transactions %>%
  left_join(demographics, by = "household_id") %>%
  mutate(
    household_size = factor(household_size, levels = c("1", "2", "3", "4", "5+")),
    household_size_numeric = case_when(
      household_size == "5+" ~ 5,
      TRUE ~ as.numeric(as.character(household_size))
    )
  )


household_sales <- transactions_demo %>%
  group_by(household_id, household_size, household_size_numeric) %>%
  summarise(total_sales = sum(sales_value)) %>%
  mutate(sales_per_member = total_sales / household_size_numeric)


sales_summary <- household_sales %>%
  group_by(household_size) %>%
  summarise(avg_sales_per_member = mean(sales_per_member))
ggplot(sales_summary, aes(x = household_size, y = avg_sales_per_member)) +
  geom_col(fill = "darkorange", alpha = 0.8) +
  labs(
    title = "Average Sales per Household Member",
    subtitle = "Total household sales divided by the number of members in each household size group.",
    x = "Household Size",
    y = "Average Sales per Member ($)",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )


Plot 3: Campaign Participation Impact by Income Bracket

data("campaigns")
campaign_participants <- campaigns %>%
  distinct(household_id) %>%
  mutate(participated = TRUE)


transactions_campaign <- transactions %>%
  left_join(campaign_participants, by = "household_id") %>%
  mutate(participated = if_else(is.na(participated), FALSE, participated)) %>%
  left_join(demographics, by = "household_id")



household_sales_campaign <- transactions_campaign %>%
  group_by(household_id, income, participated) %>%
  summarise(total_sales = sum(sales_value)) %>%
  ungroup()


avg_sales_campaign <- household_sales_campaign %>%
  group_by(income, participated) %>%
  summarise(avg_sales = mean(total_sales)) %>%
  ungroup()
ggplot(avg_sales_campaign, aes(x = income, y = avg_sales, fill = participated)) +
  geom_col(position = "dodge", width = 0.7) +
  scale_fill_manual(
    values = c("FALSE" = "#999999", "TRUE" = "#009E73"),
    labels = c("Non-Participant", "Participant")
  ) +
  labs(
    title = "Impact of Campaign Participation on Spending",
    subtitle = "Comparison of average total sales between campaign participants and non-participants across income levels",
    x = "Income Bracket",
    y = "Average Total Sales ($)",
    fill = "Campaign Participation",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top",
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  )