w5.R

# Load necessary libraries
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## Warning: package 'tibble' was built under R version 4.1.2

## Warning: package 'tidyr' was built under R version 4.1.2

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'purrr' was built under R version 4.1.2

## Warning: package 'dplyr' was built under R version 4.1.2

## Warning: package 'forcats' was built under R version 4.1.2

## Warning: package 'lubridate' was built under R version 4.1.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

##Plot 1: Top 5 product departments by income level (sales per quantity)
# Join transactions, demographics, and products
plot1_data <- transactions_sample %>%
  left_join(demographics, by = "household_id") %>%  # Add household info
  left_join(products, by = "product_id") %>%        # Add product info
  group_by(income, department) %>%                  # Group by income and department
  summarise(total_sales = sum(sales_value),
            total_qty = sum(quantity),
            .groups = "drop") %>%
  group_by(income) %>%
  slice_max(order_by = total_sales, n = 5)          # Keep top 5 departments per income

# Plot
ggplot(plot1_data, aes(x = reorder(department, -total_sales), 
                       y = total_sales/total_qty, fill = income)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "Income vs Top 5 Product Departments by Sales per Quantity",
    subtitle = "Shows which product departments generate most revenue per item by income level",
    x = "Product Department",
    y = "Sales Value per Quantity",
    fill = "Income Level",
    caption = "Data: completejourney transactions, demographics, products"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot 2: Household size vs weekday/weekend shopping behavior
plot2_data <- transactions_sample %>%
  left_join(demographics, by = "household_id") %>%
  mutate(day_type = ifelse(wday(week) %in% c(1,7), "Weekend", "Weekday")) %>%
  group_by(household_id, household_size, day_type) %>%
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    total_qty = sum(quantity, na.rm = TRUE),
    avg_sales_per_item = total_sales / total_qty,
    .groups = "drop"
  )

# plot
ggplot(plot2_data, aes(x = factor(household_size), y = avg_sales_per_item, fill = day_type)) +
  geom_boxplot(outlier.color = "red", alpha = 0.6) +
  labs(
    title = "Household Size vs Average Sales per Item",
    subtitle = "Distribution across weekdays and weekends",
    x = "Household Size",
    y = "Average Sales Value per Item",
    fill = "Day Type",
    caption = "Data: completejourney transactions and demographics"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5)
  )

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

##Plot 3: Store type vs total sales by customer age group
household_coupons <- campaigns %>%
  left_join(coupons, by = "campaign_id")

## Warning in left_join(., coupons, by = "campaign_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 88259 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

coupons_per_household <- household_coupons %>%
  group_by(household_id) %>%
  summarise(received = n(), .groups = "drop")
redemptions_per_household <- coupon_redemptions %>%
  group_by(household_id) %>%
  summarise(redeemed = n(), .groups = "drop")
coupon_stats <- coupons_per_household %>%
  left_join(redemptions_per_household, by = "household_id") %>%
  mutate(redeemed = replace_na(redeemed, 0),
         redemption_rate = redeemed / received)
coupon_stats <- coupon_stats %>%
  left_join(demographics %>% select(household_id, income), by = "household_id")
avg_redemption_by_income <- coupon_stats %>%
  group_by(income) %>%
  summarise(avg_rate = mean(redemption_rate, na.rm = TRUE)) %>%
  arrange(income)

#plot
ggplot(avg_redemption_by_income, aes(x = income, y = avg_rate)) +
  geom_col(fill = "steelblue") +
  labs(title = "Average Coupon Redemption Rate by Household Income",
       x = "Income Level",
       y = "Average Redemption Rate") +
  theme_minimal()

w5.R

shixinran

2025-09-26