R Notebook

library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

##--Data--
transactions <- transactions_sample
products <- products
demographics <- demographics
coupons <- coupons


##p1
##--Organization--
# Joining
joined_data <- transactions %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  left_join(coupons, by = "product_id")

## Warning in left_join(., coupons, by = "product_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 4744 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Define income ranges
income_ranges <- c("Under $25K", "$25K-$49K", "$50K-$74K", "$75K-$99K", "$100K-$124K", "$125K-$149K", "$150K-$174K", "$175K-$199K", "$200K and over")

# Map income values to ranges
joined_data <- joined_data %>%
  mutate(income_range = case_when(
    income == "1" ~ "Under $25K",
    income == "2" ~ "$25K-$49K",
    income == "3" ~ "$50K-$74K",
    income == "4" ~ "$75K-$99K",
    income == "5" ~ "$100K-$124K",
    income == "6" ~ "$125K-$149K",
    income == "7" ~ "$150K-$174K",
    income == "8" ~ "$175K-$199K",
    income == "9" ~ "$200K and over",
    TRUE ~ "Unknown"
  ))
# Aggregate top 5
top_products <- joined_data %>%
  group_by(product_id, product_category) %>%
  summarise(total_sales = sum(sales_value)) %>%
  top_n(5, total_sales)

## `summarise()` has grouped output by 'product_id'. You can override using the
## `.groups` argument.

# Filter only top 5 
promo_impact_top5 <- joined_data %>%
  filter(product_id %in% top_products$product_id) %>%
  mutate(promo_key = ifelse(!is.na(coupon_upc), "With Promotion", "Without Promotion"),
         income = ifelse(is.na(income) | income == "", "Unknown", income),
         product_category = ifelse(is.na(product_category) | product_category == "", "Unknown", product_category)) %>%
  filter(income != "Unknown" & product_category != "Unknown") %>%
  group_by(income, promo_key, product_category) %>%
  summarise(total_sales = sum(sales_value))

## `summarise()` has grouped output by 'income', 'promo_key'. You can override
## using the `.groups` argument.

##--Canvas--
ggplot(data = promo_impact_top5, aes(x = product_category, y = total_sales, fill = promo_key)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ income) +
  labs(title = "Impact of Promotions on the Top 5 Product Sales by Income Level",
       subtitle = "Total Sales Value for Top 5 Products",
       x = "Product Category",
       y = "Total Sales",
       caption = "Salary level 10 is the hi") +
  theme_minimal()

## p2
##--Data--
transactions <- transactions_sample
products <- products
demographics <- demographics
coupons <- coupons


transactions <- transactions %>%
  mutate(week = week(transaction_timestamp))

# Aggregate weekly sales data 
household_weekly_sales <- transactions %>%
  group_by(household_id, week) %>%
  summarise(
    total_quantity = sum(quantity, na.rm = TRUE),
    total_sales_value = sum(sales_value, na.rm = TRUE),
    trip_frequency = n_distinct(basket_id), # Count number of distinct trips (baskets)
    .groups = 'drop'
  )

# Aggregate total sales 
household_total_sales <- household_weekly_sales %>%
  group_by(household_id) %>%
  summarise(total_year_sales = sum(total_sales_value), .groups = 'drop') %>%
  arrange(desc(total_year_sales))

# Define top and bottom 10 households
top_10_households <- household_total_sales %>%
  top_n(10, total_year_sales) %>%
  mutate(category = "Top 10 Households")

bottom_10_households <- household_total_sales %>%
  top_n(-10, total_year_sales) %>%
  mutate(category = "Bottom 10 Households")

# Group
top_bottom_10 <- bind_rows(top_10_households, bottom_10_households)

# Filter
top_10_data <- household_weekly_sales %>%
  filter(household_id %in% top_10_households$household_id) %>%
  mutate(category = "Top 10 Households")

##--Canvas--
ggplot(top_10_data, aes(x = week)) +
  geom_line(aes(y = total_sales_value, color = "Sales Value", group = household_id), size = 1) +
  geom_line(aes(y = trip_frequency * 10, color = "Trip Frequency", group = household_id), linetype = "dashed", size = 1) + 
  labs(
    title = "Sales Value and Trip Frequency for Top 10 Households Over 52 Weeks",
    x = "Week",
    y = "Total Sales Value (Trip Frequency Fitted)"
  ) +
  scale_color_manual(values = c("Sales Value" = "blue", "Trip Frequency" = "red")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "bottom"
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## p3 
# Summer Weeks
summer_weeks <- 22:34

# Filter summer weeks
summer_transactions <- transactions %>%
  filter(week %in% summer_weeks)

# Aggregate 
summer_summary <- summer_transactions %>%
  group_by(household_id) %>%
  summarise(total_sales_value = sum(sales_value))

# top 100 households based on total spending
top_100_households <- summer_summary %>%
  top_n(100, wt = total_sales_value)

top_100_summer_transactions <- summer_transactions %>%
  filter(household_id %in% top_100_households$household_id)

# Most Purchased & Value
top_departments_sales <- top_100_summer_transactions %>%
  left_join(products, by = "product_id") %>%
  group_by(department) %>%
  summarise(total_purchases = n(),
            total_sales_value = sum(sales_value)) %>%
  top_n(10, wt = total_purchases) %>%
  arrange(desc(total_purchases))

##--Canvas--
ggplot(top_departments_sales, aes(x = reorder(department, total_purchases), y = total_purchases)) +
  geom_bar(stat = "identity", aes(fill = total_sales_value)) +
  scale_fill_gradient(low = "blue", high = "red", name = "Sales Value") +
  labs(title = "Top 10 Product Departments for Top 100 Spending Households Over the Summer",
       x = "Product Department",
       y = "Total Purchases",
       caption = "Note: Top 100 households are our most spending customers") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))