Final Project Draft v2

# Libraries
library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(stringr)
library(ggrepel)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

# Get full datasets
transactions <- get_transactions()
promotions <- get_promotions()

#Narrowing down the dataset to a samples of 50 stores and assigning all endcap locations 
#to one variable

endcap_codes <- c("3", "4", "5", "6")

# Sample 50 stores due to processing constraints
set.seed(20260222) #So that the same 50 stores are chosen reproducibly

sample_stores <- promotions %>%
  distinct(store_id) %>%
  sample_n(100)

transactions_50 <- transactions %>%
  filter(store_id %in% sample_stores$store_id)

promo_type <- promotions %>%
  filter(store_id %in% sample_stores$store_id) %>%
  group_by(product_id, week, store_id)

df <- transactions_50 %>%
  left_join(products, by = "product_id") %>%
  left_join(promotions, by = c("product_id", "week", "store_id"), 
             relationship = "many-to-many") %>% #This is to make sure that transactions are not duplicated
  mutate(display_group = ifelse(display_location %in% endcap_codes, "endcap", "no_endcap"))

#KEY QUESTION: Can we identify endcap products that don't perform well to make
#a recommendation for better use of that space?

#Question 1: What department(s) get the most endcap exposure timewise?

endcap_time_by_dept <- df %>%
  filter(display_group == "endcap") %>%
  group_by(department) %>%
  summarize(n_store_weeks = n_distinct(str_c(store_id, week, sep = "_"))) %>%
    arrange(desc(n_store_weeks))

ggplot(endcap_time_by_dept, aes(x = reorder(department, n_store_weeks), y = n_store_weeks)) +
  geom_col() + 
  coord_flip() +
  labs(title = "Endcap Allocation by Department",
       subtitle = "Measured in store-weeks across 50 stores",
       x = "Department",
       y = "Store-weeks on endcap") +
  theme_minimal()

endcap_revenue_by_dept <- df %>%
   filter(display_group == "endcap", !is.na(department)) %>%
   group_by(department) %>%
   summarize(endcap_sales = sum(sales_value, na.rm = TRUE),
             n_store_weeks = n_distinct(str_c(store_id, week, sep = "_")),
             revenue_per_store_week = endcap_sales / n_store_weeks) %>%
     arrange(desc(revenue_per_store_week))

ggplot(endcap_revenue_by_dept, aes(x = reorder(department, revenue_per_store_week), 
                                   y = revenue_per_store_week)) +
  geom_col() + 
  coord_flip() +
  labs(title = "Endcap Revenue by Department",
       subtitle = "Measured across 50 stores",
       x = "Department",
       y = "Revenue from endcap sales") +
  theme_minimal()

#
scatter_data <- df %>%
  filter(display_group == "endcap",
         department %in% c("GROCERY", "DRUG GM"),
         !is.na(product_category)) %>%
  group_by(product_category, department) %>%
  summarise(
    avg_revenue_per_transaction = mean(sales_value, na.rm = TRUE),
    total_revenue = sum(sales_value, na.rm = TRUE),
    n_transactions = n(),
    n_store_weeks = n_distinct(str_c(store_id, week, sep = "_")),
    .groups = "drop"
  ) %>%
  filter(n_transactions >= 20)

ggplot(scatter_data, aes(x = n_store_weeks, 
                          y = avg_revenue_per_transaction,
                          color = department)) +
  geom_vline(xintercept = median(scatter_data$n_store_weeks), 
             linetype = "dashed", color = "gray50") +
  geom_hline(yintercept = median(scatter_data$avg_revenue_per_transaction), 
             linetype = "dashed", color = "gray50") +
  geom_point(aes(size = total_revenue), alpha = 0.7) +
  geom_text(aes(label = product_category), 
            size = 2.5, vjust = -0.8, check_overlap = TRUE) +
  annotate("text", x = max(scatter_data$n_store_weeks) * 0.75, 
           y = max(scatter_data$avg_revenue_per_transaction) * 0.95,
           label = "High Revenue\nLow Exposure", 
           color = "darkgreen", size = 3, fontface = "bold") +
  annotate("text", x = max(scatter_data$n_store_weeks) * 0.75,
           y = min(scatter_data$avg_revenue_per_transaction) * 1.0,
           label = "Low Revenue\nHigh Exposure", 
           color = "red", size = 3, fontface = "bold") +
  scale_size_continuous(name = "Total Revenue ($)") +
  labs(
    title = "Endcap Exposure vs Revenue per Transaction",
    subtitle = "Bubble size represents total revenue",
    x = "Endcap Store-Weeks (Exposure)",
    y = "Avg Revenue per Transaction ($)",
    color = "Department"
  ) +
  theme_minimal()

#MAKE THIS AN INTERACTIVE BUBBLE GRAPH WITH PLOTLY

#What are the best and worst performers on endcaps from the GROCERY department?

grocery_endcap <- df %>%
  filter(display_group == "endcap", department == "GROCERY") %>%
  group_by(product_category) %>%
  summarize(avg_rev_per_tx = mean(sales_value, na.rm = TRUE),
            total_revenue = sum(sales_value, na.rm = TRUE),
            n_transactions = n(),
            n_store_weeks = n_distinct(str_c(store_id, week, sep = "_"))) %>%
  filter(n_transactions >= 10) %>%
  arrange(desc(avg_rev_per_tx))

grocery_top_bottom <- rbind(
  head(grocery_endcap, 10) %>% mutate(group = "Top 10"),
  tail(grocery_endcap, 10) %>% mutate(group = "Bottom 10")
)

ggplot(grocery_top_bottom, aes(x = reorder(product_category, avg_rev_per_tx), 
                                y = avg_rev_per_tx,
                                fill = group)) +
  geom_col() +
  scale_fill_manual(values = c("Bottom 10" = "tomato", "Top 10" = "steelblue")) +
  coord_flip() +
  labs(title = "Grocery Department: Best vs Worst Endcap Performers",
       subtitle = "By average revenue per transaction",
       x = "Product Category",
       y = "Avg Revenue per Transaction ($)",
       fill = "") +
  theme_minimal()

grocery_endcap_raw <- df %>%
  filter(display_group == "endcap", department == "GROCERY",
         !is.na(product_category)) %>%
  group_by(product_category) %>%
  mutate(n_transactions = n()) %>%
  filter(n_transactions >= 10) %>%
  ungroup()

# Get top and bottom 10 categories by median
grocery_order <- grocery_endcap_raw %>%
  group_by(product_category) %>%
  summarise(med = median(sales_value)) %>%
  arrange(desc(med))

top_bottom <- c(head(grocery_order$product_category, 10),
                tail(grocery_order$product_category, 10))

grocery_endcap_raw %>%
  filter(product_category %in% top_bottom) %>%
  mutate(group = ifelse(product_category %in% head(grocery_order$product_category, 10),
                        "Top 10", "Bottom 10")) %>%
  ggplot(aes(x = reorder(product_category, sales_value, median),
             y = sales_value,
             fill = group)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("Bottom 10" = "tomato", "Top 10" = "steelblue")) +
  coord_flip() +
  labs(title = "Grocery Department: Best vs Worst Endcap Performers",
       subtitle = "Distribution of transaction values, 50 stores 2017",
       x = "Product Category",
       y = "Sales Value per Transaction ($)",
       fill = "") +
  theme_minimal()

drug_endcap_raw <- df %>%
  filter(display_group == "endcap", department == "DRUG GM",
         !is.na(product_type)) %>%
  group_by(product_type) %>%
  mutate(n_transactions = n()) %>%
  filter(n_transactions >= 5) %>% # low threshold - noted limitation
  ungroup()

drug_order <- drug_endcap_raw %>%
  group_by(product_type) %>%
  summarise(med = median(sales_value)) %>%
  arrange(desc(med))

top_bottom_drug <- c(head(drug_order$product_type, 10),
                     tail(drug_order$product_type, 10))

drug_endcap_raw %>%
  filter(product_type %in% top_bottom_drug) %>%
  mutate(group = ifelse(product_type %in% head(drug_order$product_type, 10),
                        "Top 10", "Bottom 10")) %>%
  ggplot(aes(x = reorder(product_type, sales_value, median),
             y = sales_value,
             fill = group)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("Bottom 10" = "tomato", "Top 10" = "steelblue")) +
  coord_flip() +
  labs(title = "Drug GM Department: Best vs Worst Endcap Performers",
       subtitle = "Distribution of transaction values, 50 stores 2017. Note: low transaction threshold.",
       x = "Product Type",
       y = "Sales Value per Transaction ($)",
       fill = "") +
  theme_minimal()

# How many total grocery endcap transactions do we have?
df %>%
  filter(display_group == "endcap", department == "GROCERY") %>%
  nrow()

## [1] 48996

# Compare to total grocery transactions
df %>%
  filter(department == "GROCERY") %>%
  nrow()

## [1] 829301

#What are the best and worst performers on endcaps from the DRUG GM department?

drug_endcap <- df %>%
  filter(display_group == "endcap", department == "DRUG GM") %>%
  group_by(product_type) %>%
  summarize(avg_rev_per_tx = mean(sales_value, na.rm = TRUE),
            total_revenue = sum(sales_value, na.rm = TRUE),
            n_transactions = n(),
            n_store_weeks = n_distinct(str_c(store_id, week, sep = "_"))) %>%
  filter(n_transactions >= 5) %>% #this is a MAJOR limitation
  arrange(desc(avg_rev_per_tx))

drug_top_bottom <- rbind(
  head(drug_endcap, 10) %>% mutate(group = "Top 10"),
  tail(drug_endcap, 10) %>% mutate(group = "Bottom 10")
)

ggplot(drug_top_bottom, aes(x = reorder(product_type, avg_rev_per_tx), y = avg_rev_per_tx)) +
  geom_col() +
    coord_flip() +
  labs(    
    title = "Drug Department: Best vs Worst Endcap Performers",
    subtitle = "By average revenue per transaction",
    x = "Product Category",
    y = "Avg Revenue per Transaction ($)")

categories_final <- c(
  "BEERS/ALES", "DOMESTIC WINE", "LAUNDRY DETERGENTS", "COFFEE", "PAPER TOWELS",
  "EGGS", "SOUP", "COLD CEREAL", "FLUID MILK PRODUCTS", "COOKIES/CONES",
  "DIAPERS & DISPOSABLES", "SOFT DRINKS", "FROZEN PIZZA",
  "CANDY - PACKAGED", "BAG SNACKS"
)

results <- data.frame()
for(cat in categories_final) {
  temp <- df %>%
    filter(store_id == "319", product_category == cat) %>%
    group_by(week) %>%
    summarise(total_weekly_sales = sum(sales_value, na.rm = TRUE),
              had_endcap = any(display_group == "endcap"),
              .groups = "drop") %>%
    mutate(week_type = ifelse(had_endcap, "Endcap Week", "No Display Week"),
           product_category = cat)
  results <- rbind(results, temp)
}

category_lift <- results %>%
  group_by(product_category, week_type) %>%
  summarise(
    avg_weekly_sales = mean(total_weekly_sales, na.rm = TRUE),
    n_weeks = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = week_type,
              values_from = c(avg_weekly_sales, n_weeks)) %>%
  mutate(sales_lift_pct = (`avg_weekly_sales_Endcap Week` - `avg_weekly_sales_No Display Week`) /
                           `avg_weekly_sales_No Display Week` * 100) %>%
  filter(!is.na(sales_lift_pct),
         `n_weeks_Endcap Week` >= 4,
         `n_weeks_No Display Week` >= 4) %>%
  arrange(desc(sales_lift_pct))

category_lift

## # A tibble: 11 × 6
##    product_category `avg_weekly_sales_Endcap Week` avg_weekly_sales_No Display…¹
##    <chr>                                     <dbl>                         <dbl>
##  1 COFFEE                                    20.1                          10.8 
##  2 DOMESTIC WINE                             32.0                          18.7 
##  3 CANDY - PACKAGED                          10.9                           6.99
##  4 SOFT DRINKS                               47.1                          33.9 
##  5 BAG SNACKS                                21.3                          15.6 
##  6 COOKIES/CONES                              6.81                          5.45
##  7 SOUP                                      13.9                          11.8 
##  8 FROZEN PIZZA                              17.5                          15.7 
##  9 BEERS/ALES                                24.9                          22.7 
## 10 COLD CEREAL                               15.8                          17.6 
## 11 EGGS                                       3.32                          5.66
## # ℹ abbreviated name: ¹`avg_weekly_sales_No Display Week`
## # ℹ 3 more variables: `n_weeks_Endcap Week` <int>,
## #   `n_weeks_No Display Week` <int>, sales_lift_pct <dbl>

ggplot(category_lift, aes(x = reorder(product_category, sales_lift_pct),
                           y = sales_lift_pct,
                           fill = sales_lift_pct > 20)) +
  geom_col() +
  scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "tomato"),
                    guide = "none") +
  coord_flip() +
  geom_hline(yintercept = 20, linetype = "dashed", color = "gray50") +
  geom_text(aes(label = paste0(round(sales_lift_pct, 1), "%")), 
            hjust = -0.2, size = 3) +
  labs(
    title = "Endcap Sales Lift by Product Category at Store",
    subtitle = "Dashed line at 20% lift threshold. Red bars = low lift categories.",
    x = "Product Category",
    y = "Sales Lift (%)"
  ) +
  theme_minimal() +
  expand_limits(y = max(category_lift$sales_lift_pct) * 1.15)

categories_final <- c(
  "BEERS/ALES", "DOMESTIC WINE", "LAUNDRY DETERGENTS", "COFFEE", "PAPER TOWELS",
  "EGGS", "SOUP", "COLD CEREAL", "FLUID MILK PRODUCTS", "COOKIES/CONES",
  "DIAPERS & DISPOSABLES", "SOFT DRINKS", "FROZEN PIZZA",
  "CANDY - PACKAGED", "BAG SNACKS"
)

results <- data.frame()
for(cat in categories_final) {
  temp <- df %>%
    filter(store_id == "334", product_category == cat) %>%
    group_by(week) %>%
    summarise(total_weekly_sales = sum(sales_value, na.rm = TRUE),
              had_endcap = any(display_group == "endcap"),
              .groups = "drop") %>%
    mutate(week_type = ifelse(had_endcap, "Endcap Week", "No Display Week"),
           product_category = cat)
  results <- rbind(results, temp)
}

category_lift <- results %>%
  group_by(product_category, week_type) %>%
  summarise(
    avg_weekly_sales = mean(total_weekly_sales, na.rm = TRUE),
    n_weeks = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = week_type,
              values_from = c(avg_weekly_sales, n_weeks)) %>%
  mutate(sales_lift_pct = (`avg_weekly_sales_Endcap Week` - `avg_weekly_sales_No Display Week`) /
                           `avg_weekly_sales_No Display Week` * 100) %>%
  filter(!is.na(sales_lift_pct),
         `n_weeks_Endcap Week` >= 4,
         `n_weeks_No Display Week` >= 4) %>%
  arrange(desc(sales_lift_pct))

category_lift

## # A tibble: 7 × 6
##   product_category `avg_weekly_sales_Endcap Week` avg_weekly_sales_No Display …¹
##   <chr>                                     <dbl>                          <dbl>
## 1 CANDY - PACKAGED                           8.67                           6.29
## 2 SOFT DRINKS                               36.1                           27.1 
## 3 COLD CEREAL                               12.9                            9.71
## 4 BAG SNACKS                                10.6                            8.61
## 5 FROZEN PIZZA                              16.0                           13.0 
## 6 SOUP                                       7.70                          10.2 
## 7 COOKIES/CONES                              3.76                           5.36
## # ℹ abbreviated name: ¹`avg_weekly_sales_No Display Week`
## # ℹ 3 more variables: `n_weeks_Endcap Week` <int>,
## #   `n_weeks_No Display Week` <int>, sales_lift_pct <dbl>

ggplot(category_lift, aes(x = reorder(product_category, sales_lift_pct),
                           y = sales_lift_pct,
                           fill = sales_lift_pct > 20)) +
  geom_col() +
  scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "tomato"),
                    guide = "none") +
  coord_flip() +
  geom_hline(yintercept = 20, linetype = "dashed", color = "gray50") +
  geom_text(aes(label = paste0(round(sales_lift_pct, 1), "%")), 
            hjust = -0.2, size = 3) +
  labs(
    title = "Endcap Sales Lift by Product Category at Store 334",
    subtitle = "Dashed line at 20% lift threshold. Red bars = low lift categories.",
    x = "Product Category",
    y = "Sales Lift (%)"
  ) +
  theme_minimal() +
  expand_limits(y = max(category_lift$sales_lift_pct) * 1.15)

# Find stores in our sample with good transaction counts
df %>%
  filter(product_category %in% categories_final) %>%
  group_by(store_id) %>%
  summarise(
    n_transactions = n(),
    n_categories = n_distinct(product_category),
    has_endcap = any(display_group == "endcap")
  ) %>%
  filter(has_endcap == TRUE) %>%
  arrange(desc(n_categories))

## # A tibble: 100 × 4
##    store_id n_transactions n_categories has_endcap
##    <chr>             <int>        <int> <lgl>     
##  1 289                2939           15 TRUE      
##  2 292                5596           15 TRUE      
##  3 296                2515           15 TRUE      
##  4 297                2262           15 TRUE      
##  5 299                1245           15 TRUE      
##  6 300                2000           15 TRUE      
##  7 304                3206           15 TRUE      
##  8 306                2170           15 TRUE      
##  9 309                1731           15 TRUE      
## 10 310                2284           15 TRUE      
## # ℹ 90 more rows

stores_to_check <- df %>%
  distinct(store_id) %>%
  pull(store_id)

all_results <- data.frame()
for(store in stores_to_check) {
  for(cat in categories_final) {
    temp <- df %>%
      filter(store_id == store, product_category == cat) %>%
      group_by(week) %>%
      summarise(total_weekly_sales = sum(sales_value, na.rm = TRUE),
                had_endcap = any(display_group == "endcap"),
                .groups = "drop") %>%
      mutate(week_type = ifelse(had_endcap, "Endcap Week", "No Display Week"),
             product_category = cat,
             store_id = store)
    all_results <- rbind(all_results, temp)
  }
}

# Get lift per store per category
lift_by_store <- all_results %>%
  group_by(store_id, product_category, week_type) %>%
  summarise(avg_weekly_sales = mean(total_weekly_sales, na.rm = TRUE),
            n_weeks = n(),
            .groups = "drop") %>%
  pivot_wider(names_from = week_type,
              values_from = c(avg_weekly_sales, n_weeks)) %>%
  mutate(sales_lift_pct = (`avg_weekly_sales_Endcap Week` - `avg_weekly_sales_No Display Week`) /
                           `avg_weekly_sales_No Display Week` * 100) %>%
  filter(!is.na(sales_lift_pct),
         `n_weeks_Endcap Week` >= 2,
         `n_weeks_No Display Week` >= 2)

# Then average lift across stores
lift_summary <- lift_by_store %>%
  group_by(product_category) %>%
  summarise(
    avg_lift = mean(sales_lift_pct),
    median_lift = median(sales_lift_pct),
    n_stores = n()
  ) %>%
  arrange(desc(avg_lift))

lift_summary

## # A tibble: 15 × 4
##    product_category      avg_lift median_lift n_stores
##    <chr>                    <dbl>       <dbl>    <int>
##  1 PAPER TOWELS             56.8        42.4        60
##  2 FROZEN PIZZA             56.3        50.8        95
##  3 SOUP                     49.4        42.9        93
##  4 DOMESTIC WINE            49.2        37.0        35
##  5 BEERS/ALES               48.0        40.0        66
##  6 COFFEE                   47.0        33.2        77
##  7 CANDY - PACKAGED         41.7        34.7        88
##  8 DIAPERS & DISPOSABLES    40.6        29.9        35
##  9 COOKIES/CONES            40.4        32.2        95
## 10 BAG SNACKS               39.7        32.1       100
## 11 COLD CEREAL              33.4        31.3        99
## 12 LAUNDRY DETERGENTS       24.6        10.6        77
## 13 SOFT DRINKS              23.3        19.9        87
## 14 EGGS                     14.0        10.2        24
## 15 FLUID MILK PRODUCTS       8.63        6.30       34

all_results_full <- data.frame()
for(store in stores_to_check) {
  temp <- df %>%
    filter(store_id == store, !is.na(product_category)) %>%
    group_by(week, product_category) %>%
    summarise(total_weekly_sales = sum(sales_value, na.rm = TRUE),
              had_endcap = any(display_group == "endcap"),
              .groups = "drop") %>%
    mutate(week_type = ifelse(had_endcap, "Endcap Week", "No Display Week"),
           store_id = store)
  all_results_full <- rbind(all_results_full, temp)
}

lift_full <- all_results_full %>%
  group_by(store_id, product_category, week_type) %>%
  summarise(avg_weekly_sales = mean(total_weekly_sales, na.rm = TRUE),
            n_weeks = n(),
            .groups = "drop") %>%
  pivot_wider(names_from = week_type,
              values_from = c(avg_weekly_sales, n_weeks)) %>%
  mutate(sales_lift_pct = (`avg_weekly_sales_Endcap Week` - `avg_weekly_sales_No Display Week`) /
                           `avg_weekly_sales_No Display Week` * 100) %>%
  filter(!is.na(sales_lift_pct),
         `n_weeks_Endcap Week` >= 5,
         `n_weeks_No Display Week` >= 5) %>%
group_by(product_category) %>%
  summarise(
    avg_lift = mean(sales_lift_pct),
    median_lift = median(sales_lift_pct),
    n_stores = n()
  ) %>%
  filter(n_stores >= 5) %>%
  arrange(desc(median_lift))

print(lift_full, n = Inf)

## # A tibble: 65 × 4
##    product_category               avg_lift median_lift n_stores
##    <chr>                             <dbl>       <dbl>    <int>
##  1 BAKING NEEDS                     157.        136.         15
##  2 FROZEN PIE/DESSERTS               83.6        76.7        35
##  3 VEGETABLES - SHELF STABLE         56.5        61.0        48
##  4 FROZEN PIZZA                      55.5        49.2        88
##  5 DOMESTIC WINE                     57.8        48.6        11
##  6 FRUIT - SHELF STABLE              45.8        44.3        13
##  7 MEAT - SHELF STABLE               47.9        43.5        53
##  8 CHIPS&SNACKS                      52.6        42.1         5
##  9 DRY BN/VEG/POTATO/RICE            43.4        41.8        13
## 10 YOGURT                            47.8        41.4        17
## 11 REFRGRATD DOUGH PRODUCTS          48.8        41.1         8
## 12 SOUP                              43.8        40.7        69
## 13 CRACKERS/MISC BKD FD              46.5        40.0        92
## 14 BEERS/ALES                        46.0        39.5        43
## 15 FRZN MEAT/MEAT DINNERS            46.1        39.3        86
## 16 CONVENIENT BRKFST/WHLSM SNACKS    37.8        39.1        60
## 17 PAPER TOWELS                      45.0        37.9        14
## 18 CANDY - PACKAGED                  42.1        37.8        63
## 19 MEAT - MISC                       40.4        37.3         9
## 20 ICE CREAM/MILK/SHERBTS            37.3        36.8        93
## 21 COFFEE                            38.8        36.2        28
## 22 BEANS - CANNED GLASS & MW         40.5        35.0        10
## 23 FRZN VEGETABLE/VEG DSH            36.1        34.9        58
## 24 COOKIES/CONES                     41.1        34.2        77
## 25 BAG SNACKS                        43.6        34.0        88
## 26 MILK BY-PRODUCTS                  36.7        33.8         6
## 27 CANNED JUICES                     34.7        33.1        58
## 28 DRY MIX DESSERTS                  57.7        32.7        18
## 29 COLD CEREAL                       36.3        32.6        87
## 30 WATER - CARBONATED/FLVRD DRINK    38.4        31.3        38
## 31 FRZN NOVELTIES/WTR ICE            40.2        31.2        63
## 32 CANDY - CHECKLANE                 41.6        30.6        40
## 33 FROZEN MEAT                       46.6        30.1        14
## 34 BUTTER                            49.3        29.0         6
## 35 BATH TISSUES                      35.1        28.6        60
## 36 WAREHOUSE SNACKS                  34.4        27.0        20
## 37 BAKING MIXES                      34.0        26.4        21
## 38 PASTA SAUCE                       30.2        26.4        11
## 39 SNACK NUTS                        24.7        25.8         6
## 40 ISOTONIC DRINKS                   30.2        25.5        41
## 41 CONDIMENTS/SAUCES                 30.3        25.4        43
## 42 FRZN POTATOES                     18.6        23.3        16
## 43 SUGARS/SWEETNERS                  28.9        21.4        11
## 44 SALD DRSNG/SNDWCH SPRD            27.1        20.9        68
## 45 SOFT DRINKS                       25.7        20.3        72
## 46 DINNER MXS:DRY                    22.2        19.4        34
## 47 PNT BTR/JELLY/JAMS                19.0        17.7        16
## 48 PAPER HOUSEWARES                  35.6        17.4         6
## 49 REFRGRATD JUICES/DRNKS            18.7        15.6        23
## 50 FD WRAPS/BAGS/TRSH BG             20.0        15.3         6
## 51 CHEESE                            17.0        14.6        28
## 52 BAKED BREAD/BUNS/ROLLS            11.2        14.1        31
## 53 LAUNDRY DETERGENTS                23.5        14.1        38
## 54 SOAP - LIQUID & BAR               16.0        12.6        14
## 55 HAIR CARE PRODUCTS                30.7        11.5        17
## 56 ORAL HYGIENE PRODUCTS             25.1        11.4        17
## 57 EGGS                              18.2        11.2        15
## 58 PWDR/CRYSTL DRNK MX               25.4        10.1        21
## 59 CEREAL/BREAKFAST                  13.8         9.28        9
## 60 SEAFOOD - FROZEN                  15.3         7.62        7
## 61 FLUID MILK PRODUCTS                9.44        6.85       31
## 62 DIAPERS & DISPOSABLES             19.2         3.44        5
## 63 COLD AND FLU                      31.9         1.62        5
## 64 FRZN BREAKFAST FOODS               4.24        1.49       29
## 65 SHORTENING/OIL                    -9.81      -17.9         8

lift_by_store %>%
  filter(product_category %in% lift_full$product_category) %>%
  ggplot(aes(x = reorder(product_category, sales_lift_pct, median), 
             y = sales_lift_pct)) +
  geom_boxplot() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(title = "Distribution of Endcap Sales Lift Across Stores",
       x = "Product Category",
       y = "Sales Lift (%)")

mailer_quantity <- df %>%
  filter(display_group == "endcap",
         department %in% c("GROCERY", "DRUG GM"),
         !is.na(product_category)) %>%
  mutate(mailer_flag = ifelse(!is.na(mailer_location) & mailer_location != "0",
                               TRUE, FALSE)) %>%
  group_by(product_category, department, mailer_flag) %>%
  summarise(
    avg_revenue = mean(sales_value, na.rm = TRUE),
    avg_quantity = mean(quantity, na.rm = TRUE),
    n_transactions = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = mailer_flag,
              values_from = c(avg_revenue, avg_quantity, n_transactions)) %>%
  rename(
    avg_revenue_endcap_only = avg_revenue_FALSE,
    avg_revenue_endcap_mailer = avg_revenue_TRUE,
    avg_quantity_endcap_only = avg_quantity_FALSE,
    avg_quantity_endcap_mailer = avg_quantity_TRUE
  ) %>%
  filter(!is.na(avg_revenue_endcap_only) & !is.na(avg_revenue_endcap_mailer)) %>%
  mutate(
    quantity_effect = avg_quantity_endcap_mailer - avg_quantity_endcap_only,
    revenue_effect = avg_revenue_endcap_mailer - avg_revenue_endcap_only
  )

p <- ggplot(mailer_quantity, aes(x = revenue_effect, 
                                  y = quantity_effect,
                                  color = department,
                                  text = product_category)) +
  geom_point(alpha = 0.7) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  annotate("text", x = max(mailer_quantity$revenue_effect) * 0.7,
           y = max(mailer_quantity$quantity_effect) * 0.9,
           label = "More units\nMore revenue", color = "darkgreen", 
           size = 3, fontface = "bold") +
  annotate("text", x = min(mailer_quantity$revenue_effect) * 0.7,
           y = max(mailer_quantity$quantity_effect) * 0.9,
           label = "More units\nLess revenue", color = "steelblue", 
           size = 3, fontface = "bold") +
  annotate("text", x = max(mailer_quantity$revenue_effect) * 0.7,
           y = min(mailer_quantity$quantity_effect) * 0.9,
           label = "Fewer units\nMore revenue", color = "darkorange", 
           size = 3, fontface = "bold") +
  annotate("text", x = min(mailer_quantity$revenue_effect) * 0.7,
           y = min(mailer_quantity$quantity_effect) * 0.9,
           label = "Fewer units\nLess revenue", color = "red", 
           size = 3, fontface = "bold") +
  labs(
    title = "Mailer Effect on Quantity vs Revenue When on Endcap",
    subtitle = "Each point represents a product category. 50 stores, 2017.",
    x = "Change in Avg Revenue per Transaction ($)",
    y = "Change in Avg Quantity per Transaction",
    color = "Department"
  ) +
  theme_minimal()

ggplotly(p, tooltip = "text")

#SHOULD FILTER THIS DOWN TO SOMETHING LIKE TOP 10 BOTTOM 10


lift_by_store_full <- all_results_full %>%
  group_by(store_id, product_category, week_type) %>%
  summarise(avg_weekly_sales = mean(total_weekly_sales, na.rm = TRUE),
            n_weeks = n(),
            .groups = "drop") %>%
  pivot_wider(names_from = week_type,
              values_from = c(avg_weekly_sales, n_weeks)) %>%
  mutate(sales_lift_pct = (`avg_weekly_sales_Endcap Week` - `avg_weekly_sales_No Display Week`) /
                           `avg_weekly_sales_No Display Week` * 100) %>%
  filter(!is.na(sales_lift_pct),
         `n_weeks_Endcap Week` >= 5,
         `n_weeks_No Display Week` >= 5)

lift_by_store_full %>%
  filter(product_category %in% lift_full$product_category) %>%
  ggplot(aes(x = reorder(product_category, sales_lift_pct, median), 
             y = sales_lift_pct)) +
  geom_boxplot() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(title = "Distribution of Endcap Sales Lift Across Stores",
       x = "Product Category",
       y = "Sales Lift (%)") +
  theme_minimal()

library(knitr)

lift_by_store_full %>%
  filter(product_category %in% lift_full$product_category) %>%
  group_by(product_category) %>%
  summarise(
    median_lift = median(sales_lift_pct),
    avg_lift = mean(sales_lift_pct),
    pct_stores_positive = mean(sales_lift_pct > 0) * 100,
    n_stores = n()
  ) %>%
  arrange(desc(median_lift)) %>%
  kable(digits = 1,
        col.names = c("Product Category", "Median Lift (%)", 
                      "Avg Lift (%)", "% Stores Positive", "N Stores"),
        caption = "Endcap Sales Lift by Product Category (All Stores)")

Endcap Sales Lift by Product Category (All Stores)
Product Category	Median Lift (%)	Avg Lift (%)	% Stores Positive	N Stores
BAKING NEEDS	136.2	157.3	100.0	15
FROZEN PIE/DESSERTS	76.7	83.6	85.7	35
VEGETABLES - SHELF STABLE	61.0	56.5	87.5	48
FROZEN PIZZA	49.2	55.5	94.3	88
DOMESTIC WINE	48.6	57.8	81.8	11
FRUIT - SHELF STABLE	44.3	45.8	100.0	13
MEAT - SHELF STABLE	43.5	47.9	96.2	53
CHIPS&SNACKS	42.1	52.6	100.0	5
DRY BN/VEG/POTATO/RICE	41.8	43.4	84.6	13
YOGURT	41.4	47.8	100.0	17
REFRGRATD DOUGH PRODUCTS	41.1	48.8	87.5	8
SOUP	40.7	43.8	97.1	69
CRACKERS/MISC BKD FD	40.0	46.5	92.4	92
BEERS/ALES	39.5	46.0	93.0	43
FRZN MEAT/MEAT DINNERS	39.3	46.1	93.0	86
CONVENIENT BRKFST/WHLSM SNACKS	39.1	37.8	86.7	60
PAPER TOWELS	37.9	45.0	85.7	14
CANDY - PACKAGED	37.8	42.1	76.2	63
MEAT - MISC	37.3	40.4	88.9	9
ICE CREAM/MILK/SHERBTS	36.8	37.3	89.2	93
COFFEE	36.2	38.8	85.7	28
BEANS - CANNED GLASS & MW	35.0	40.5	80.0	10
FRZN VEGETABLE/VEG DSH	34.9	36.1	86.2	58
COOKIES/CONES	34.2	41.1	88.3	77
BAG SNACKS	34.0	43.6	94.3	88
MILK BY-PRODUCTS	33.8	36.7	100.0	6
CANNED JUICES	33.1	34.7	82.8	58
DRY MIX DESSERTS	32.7	57.7	72.2	18
COLD CEREAL	32.6	36.3	89.7	87
WATER - CARBONATED/FLVRD DRINK	31.3	38.4	81.6	38
FRZN NOVELTIES/WTR ICE	31.2	40.2	88.9	63
CANDY - CHECKLANE	30.6	41.6	85.0	40
FROZEN MEAT	30.1	46.6	71.4	14
BUTTER	29.0	49.3	100.0	6
BATH TISSUES	28.6	35.1	81.7	60
WAREHOUSE SNACKS	27.0	34.4	70.0	20
BAKING MIXES	26.4	34.0	85.7	21
PASTA SAUCE	26.4	30.2	72.7	11
SNACK NUTS	25.8	24.7	66.7	6
ISOTONIC DRINKS	25.5	30.2	70.7	41
CONDIMENTS/SAUCES	25.4	30.3	86.0	43
FRZN POTATOES	23.3	18.6	68.8	16
SUGARS/SWEETNERS	21.4	28.9	90.9	11
SALD DRSNG/SNDWCH SPRD	20.9	27.1	80.9	68
SOFT DRINKS	20.3	25.7	79.2	72
DINNER MXS:DRY	19.4	22.2	70.6	34
PNT BTR/JELLY/JAMS	17.7	19.0	68.8	16
PAPER HOUSEWARES	17.4	35.6	66.7	6
REFRGRATD JUICES/DRNKS	15.6	18.7	73.9	23
FD WRAPS/BAGS/TRSH BG	15.3	20.0	83.3	6
CHEESE	14.6	17.0	78.6	28
BAKED BREAD/BUNS/ROLLS	14.1	11.2	74.2	31
LAUNDRY DETERGENTS	14.1	23.5	73.7	38
SOAP - LIQUID & BAR	12.6	16.0	64.3	14
HAIR CARE PRODUCTS	11.5	30.7	76.5	17
ORAL HYGIENE PRODUCTS	11.4	25.1	64.7	17
EGGS	11.2	18.2	80.0	15
PWDR/CRYSTL DRNK MX	10.1	25.4	71.4	21
CEREAL/BREAKFAST	9.3	13.8	77.8	9
SEAFOOD - FROZEN	7.6	15.3	57.1	7
FLUID MILK PRODUCTS	6.9	9.4	64.5	31
DIAPERS & DISPOSABLES	3.4	19.2	60.0	5
COLD AND FLU	1.6	31.9	60.0	5
FRZN BREAKFAST FOODS	1.5	4.2	51.7	29
SHORTENING/OIL	-17.9	-9.8	37.5	8

Final Project Draft v2

Erica Fatica

2026-02-22