Introduction

The objective of this project is to uncover unique insights within the CompleteJourney dataset through exploratory data visualization. By developing three informative and professionally formatted plots, the aim is to highlight meaningful patterns in coupon redemptions, campaign performance, and product-level behavior. Each visualization is designed to tell a clear story that can be quickly understood without requiring extensive background knowledge. My role will be to provide the necessary context about the dataset and guide the interpretation, while the visuals themselves remain simple, intuitive, and self-explanatory.

Packages and Importing Data

library(tidyverse)
library(completejourney)
library(patchwork)
library(forcats)
transactions <- get_transactions()
promotions <- get_promotions()

Coupon Redepmtion by Household Income

# Number of coupons by income
coupon_by_income <- coupon_redemptions %>%
  left_join(demographics, by = "household_id") %>%
  group_by(income) %>%
  summarise(redemptions = n()) %>%
  arrange(desc(redemptions))

graph_1 <- coupon_by_income %>%
  ggplot(aes(x = income, y = redemptions, fill = income)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Income Level",
       y = "Number of Redemptions") +
  theme_minimal() +
  coord_flip()

# Number of coupons per campaign
coupons_per_campaign <- coupons %>%
  group_by(campaign_id) %>%
  summarise(coupons_in_campaign = n())

# Join with campaigns to get households
coupons_received <- campaigns %>%
  left_join(coupons_per_campaign, by = "campaign_id") %>%
  left_join(demographics, by = "household_id") %>%
  group_by(income) %>%
  summarise(received = sum(coupons_in_campaign))

# Number of coupons redeemed
coupons_redeemed <- coupon_redemptions %>%
  left_join(demographics, by = "household_id") %>%
  group_by(income) %>%
  summarise(redeemed = n())

# Coupon Utilization
coupon_utilization <- coupons_received %>%
  left_join(coupons_redeemed, by = "income") %>%
  mutate(utilization_rate = redeemed / received * 100)

graph_2 <- coupon_utilization %>%
  ggplot(aes(x = income, 
             y = utilization_rate, fill = income)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "",
       y = "Utilization Rate (Redeemed / Received)") +
  theme_minimal() +
  coord_flip()

# Combing the two graphs and giving context
(graph_1 + graph_2) +
  plot_annotation(
    title = 'Not All Incomes Treat Coupons the Same',
    subtitle = 'While mid-income households drive the bulk of redemptions, utilization rates reveal a more nuanced story.\nHouseholds earning 150–174K redeemed fewer coupons overall but showed the highest utilization rate.',
    caption = "Source: CompleteJourney dataset") &
  theme(
    plot.title = element_text(face = "bold", size = 16)
  )

Coupon Redepmtion by Product Department

# Coupon Redemption by Products
coupon_redemptions_products <- coupon_redemptions %>%
  left_join(coupons, by = c("coupon_upc", "campaign_id")) %>%
  left_join(products, by = "product_id") %>%
  mutate(department = fct_explicit_na(department, na_level = "UNKNOWN"))

# Coupon Redemption by Product Department
redemptions_by_department <- coupon_redemptions_products %>%
  group_by(department) %>%
  summarise(total_redemptions = n()) %>%
  arrange(desc(total_redemptions))

graph_3 <- redemptions_by_department %>%
  ggplot(aes(x = reorder(department, total_redemptions), 
             y = total_redemptions, fill = department)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Product Department",
       y = "Total Redemptions") +
  theme_minimal() +
  coord_flip()

# Coupons Received by Department
coupons_received_dept <- coupons %>%
  left_join(products, by = "product_id") %>%
  group_by(department) %>%
  summarise(received = n(), .groups = "drop") %>%
  mutate(department = fct_explicit_na(department, na_level = "UNKNOWN"))

# Joining with Redemption
dept_utilization <- redemptions_by_department %>%
  left_join(coupons_received_dept, by = "department") %>%
  mutate(utilization_rate = total_redemptions / received)

graph_4 <- dept_utilization %>%
  ggplot(aes(x = reorder(department, utilization_rate),
             y = utilization_rate, fill = department)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "",
       y = "Utilization Rate") +
  theme_minimal() +
  coord_flip()


# Combing the two graphs and giving context
(graph_3 + graph_4) +
  plot_annotation(
    title = "Are Coupons Always Effective?",
    subtitle = "Some departments redeem thousands of coupons, while others barely use what they receive.\nStaples like Dairy and Frozen drive large redemption counts, while other categories show higher utilization efficiency.",
    caption = "Source: CompleteJourney dataset") &
  theme(
    plot.title = element_text(face = "bold", size = 16)
  )

Campaign Success by Redmption and Utilization

# Coupon Catalog
coupon_catalog <- coupons %>%
  left_join(products, by = "product_id") %>%
  select(campaign_id, coupon_upc, department)

# Households per Campaign
households_per_campaign <- campaigns %>%
  group_by(campaign_id) %>%
  summarise(households = n_distinct(household_id))

# Coupons per Campaign
coupons_per_campaign <- coupons %>%
  group_by(campaign_id) %>%
  summarise(coupons_in_campaign = n_distinct(coupon_upc))

# Offers
offers <- coupons_per_campaign %>%
  left_join(households_per_campaign, by = "campaign_id") %>%
  mutate(received = coupons_in_campaign * households)

# Redemptions 
redemptions <- coupon_redemptions %>%
  distinct(household_id, campaign_id, coupon_upc) %>%  # ever redeemed per offer
  group_by(campaign_id) %>%
  summarise(redeemed = n())

# Campaign Metrics by ID
campaign_perf <- offers %>%
  left_join(redemptions, by = "campaign_id") %>%
  mutate(
    redeemed = replace_na(redeemed, 0),
    utilization_rate = redeemed / received
  )

graph_5 <- campaign_perf %>%
  ggplot(aes(x = reorder(campaign_id, redeemed), y = redeemed)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Campaign ID",
       y = "Total Redemptions") +
  theme_minimal() +
  coord_flip()

graph_6 <- campaign_perf %>%
  mutate(campaign_id = factor(campaign_id, levels = c("24", "1", "25", "6", "15", 
                                                      "3", "21", "2", "7", "11", 
                                                      "23", "5", "4", "10", "26", 
                                                      "20", "12", "19", "14", "16",
                                                      "22", "9", "17", "27", "8", 
                                                      "13", "18"))) %>%
  ggplot(aes(x = campaign_id, y = utilization_rate)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "",
       y = "Utilization Rate") +
  theme_minimal() +
  coord_flip()

# Combing the two graphs and giving context
(graph_5 + graph_6) +
  plot_annotation(
    title = "Finding the Most Effective Promotions",
    subtitle = "Some campaigns generated large redemption volumes, while others stood out for efficiency\nDespite millions of offers distributed, only a fraction were redeemed, showing wide variation in effectiveness.",
    caption = "Source: CompleteJourney dataset") &
  theme(
    plot.title = element_text(face = "bold", size = 16)
  )