Loading Libraries and Data
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
# Get transactions data
transactions <- get_transactions()
# Check dimensions of transactions data
print(dim(transactions)) # Should output [1] 1469307 11
## [1] 1469307 11
# Get promotions data
promotions <- get_promotions()
# Check dimensions of promotions data
print(dim(promotions)) # Should output [1] 20940529 5
## [1] 20940529 5
Q3: Identify all households that have total sales (sales_value) of
$1000 or more. To do this, fill in the blanks to compute total sales by
household ID and then filter for those household IDs that have
total_sales equal to or greater than $1000.
# Identify households with $1000 or more in total sales
hshld_1000 <- transactions %>%
group_by(household_id) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
filter(total_sales >= 1000)
# How many of these households do we have demographic data on?
hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 742
# How many do we not have demographic on?
hshld_1000 %>%
anti_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 636
# Which income range produces the most households that spend $1000 or more?
hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
group_by(income) %>% # Use the corrected column name
tally() %>%
arrange(desc(n))
## # A tibble: 12 × 2
## income n
## <ord> <int>
## 1 50-74K 185
## 2 35-49K 154
## 3 75-99K 88
## 4 25-34K 69
## 5 15-24K 67
## 6 Under 15K 58
## 7 125-149K 36
## 8 100-124K 33
## 9 150-174K 27
## 10 175-199K 10
## 11 250K+ 10
## 12 200-249K 5
Q6: Identify all different products that contain “pizza” in their
product_type description. Which of these products produces the greatest
amount of total sales (compute total sales by product ID and product
type)?
# Filter products with "pizza" in their product_type description
pizza_products <- products %>%
filter(str_detect(product_type, regex("pizza", ignore_case = TRUE)))
# Join these pizza products with the transactions data to compute total sales
pizza_sales <- pizza_products %>%
inner_join(transactions, by = "product_id") %>%
group_by(product_id, product_type) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
## `summarise()` has grouped output by 'product_id'. You can override using the
## `.groups` argument.
# Display the product with the greatest total sales
top_pizza_product <- pizza_sales %>%
slice_head(n = 1)
# Display pizza_products, pizza_sales, and top_pizza_product
pizza_products
## # A tibble: 532 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 33061 754 GROCERY National CONDIMENTS/SAUCES PIZZA SAUCE
## 2 60879 822 NUTRITION National FROZEN FROZEN PIZZA
## 3 68688 1722 GROCERY National FROZEN PIZZA PIZZA/TRADI…
## 4 89652 1039 GROCERY National FROZEN PIZZA PIZZA/ECONO…
## 5 90602 1039 GROCERY National FROZEN PIZZA PIZZA/ECONO…
## 6 97125 1039 GROCERY National FROZEN PIZZA PIZZA/ECONO…
## 7 101531 1755 GROCERY National FROZEN PIZZA PIZZA/TRADI…
## 8 101722 1755 GROCERY National FROZEN PIZZA PIZZA/TRADI…
## 9 101723 1755 GROCERY National FROZEN PIZZA PIZZA/TRADI…
## 10 101785 1039 GROCERY National FROZEN PIZZA PIZZA/ECONO…
## # ℹ 522 more rows
## # ℹ 1 more variable: package_size <chr>
pizza_sales
## # A tibble: 394 × 3
## # Groups: product_id [394]
## product_id product_type total_sales
## <chr> <chr> <dbl>
## 1 944139 PIZZA/TRADITIONAL 1345.
## 2 906838 PIZZA/PREMIUM 1264.
## 3 12648296 PIZZA/TRADITIONAL 1230.
## 4 969568 PIZZA/TRADITIONAL 1125.
## 5 925626 PIZZA/ECONOMY 1018.
## 6 1127624 PIZZA/ECONOMY 849
## 7 1021116 PIZZA/TRADITIONAL 783.
## 8 1009368 PIZZA/PREMIUM 718.
## 9 9524291 PREP FD: PIZZA (COLD) 717.
## 10 995628 PIZZA/TRADITIONAL 703.
## # ℹ 384 more rows
top_pizza_product
## # A tibble: 394 × 3
## # Groups: product_id [394]
## product_id product_type total_sales
## <chr> <chr> <dbl>
## 1 1002610 PIZZA/TRADITIONAL 19.0
## 2 1004596 PIZZA/PREMIUM 618.
## 3 1005135 PIZZA/SINGLE SERVE/MICROWAVE 7.38
## 4 1009368 PIZZA/PREMIUM 718.
## 5 1013868 PIZZA/SINGLE SERVE/MICROWAVE 51
## 6 1015134 PIZZA/TRADITIONAL 80.3
## 7 101531 PIZZA/TRADITIONAL 8.65
## 8 1016715 PIZZA/PREMIUM 73.0
## 9 101722 PIZZA/TRADITIONAL 1.65
## 10 101723 PIZZA/TRADITIONAL 7
## # ℹ 384 more rows
Q7: Fill in the blanks to identify all products that are categorized
(product_category) as “pizza” but are considered a “snack” or
“appetizer” (via product_type). Hint: the simplest way to do this is to
filter first for pizza products and then second for products that are
snacks or appetizers.
# Step 1: Filter for Relevant Products
relevant_products <- products %>%
filter(
str_detect(product_category, regex("pizza", ignore_case = TRUE)),
str_detect(product_type, regex("snack|appetizer", ignore_case = TRUE))
)
# Display relevant_products
relevant_products
## # A tibble: 225 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 28892 69 GROCERY Private FROZEN PIZZA SNACKS/APPET…
## 2 30002 69 GROCERY Private FROZEN PIZZA SNACKS/APPET…
## 3 42687 1873 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 4 80730 1142 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 5 81421 1423 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 6 83722 1039 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 7 85737 1142 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 8 92490 1039 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 9 92902 1142 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 10 95165 1423 GROCERY National FROZEN PIZZA SNACKS/APPET…
## # ℹ 215 more rows
## # ℹ 1 more variable: package_size <chr>
# Step 2: Compute the Total Quantity of Items Sold
# Compute total quantity of items sold by product ID
total_quantity_sold <- relevant_products %>%
inner_join(transactions, by = 'product_id') %>%
group_by(product_id) %>%
summarise(total_qty = sum(quantity)) %>%
arrange(desc(total_qty))
# Display total_quantity_sold
total_quantity_sold
## # A tibble: 188 × 2
## product_id total_qty
## <chr> <dbl>
## 1 845193 847
## 2 890695 718
## 3 907631 632
## 4 856252 479
## 5 1103105 276
## 6 1043494 241
## 7 999639 228
## 8 1035676 223
## 9 1123720 207
## 10 1054216 194
## # ℹ 178 more rows
Q8: Identify all products that contain “peanut butter” in their
product_type. How many unique products does this result in?
pb <- products %>%
filter(str_detect(product_type, regex("peanut butter", ignore_case = TRUE)))
tally(pb)
## # A tibble: 1 × 1
## n
## <int>
## 1 144
pb %>%
inner_join(transactions, by = "product_id") %>%
group_by(month = month(transaction_timestamp, label = TRUE)) %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
## # A tibble: 12 × 2
## month total_sales
## <ord> <dbl>
## 1 Dec 1014.
## 2 Sep 903.
## 3 Aug 895.
## 4 Nov 886.
## 5 Oct 881.
## 6 Jun 872.
## 7 Jan 872.
## 8 Jul 858.
## 9 May 845.
## 10 Apr 794.
## 11 Feb 747.
## 12 Mar 735.
# Display pb
pb
## # A tibble: 144 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 41258 69 GROCERY Private PNT BTR/JELLY/JA… PEANUT BUTT…
## 2 43065 69 GROCERY Private PNT BTR/JELLY/JA… PEANUT BUTT…
## 3 45277 69 GROCERY Private PNT BTR/JELLY/JA… PEANUT BUTT…
## 4 49058 69 GROCERY Private PNT BTR/JELLY/JA… PEANUT BUTT…
## 5 155364 1179 GROCERY National PNT BTR/JELLY/JA… PEANUT BUTT…
## 6 196521 1266 GROCERY National PNT BTR/JELLY/JA… PEANUT BUTT…
## 7 197599 1266 GROCERY National PNT BTR/JELLY/JA… PEANUT BUTT…
## 8 198548 1266 GROCERY National PNT BTR/JELLY/JA… PEANUT BUTT…
## 9 198651 1266 GROCERY National PNT BTR/JELLY/JA… PEANUT BUTT…
## 10 203890 1266 GROCERY National PNT BTR/JELLY/JA… PEANUT BUTT…
## # ℹ 134 more rows
## # ℹ 1 more variable: package_size <chr>
Q9:Using the coupon_redemptions data, filter for the coupon
associated with campaign_id 18 and coupon_upc “10000085475”. How many
households redeemed this coupon? Now, using this coupon, identify the
total sales_value for all transactions associated with the household_ids
that redeemed this coupon on the same day they redeemed the coupon.
# Step 1: Filter coupon_redemptions for specific campaign_id and coupon_upc
filtered_coupons <- coupon_redemptions %>%
filter(campaign_id == 18, coupon_upc == "10000085475")
# Step 2: Join with transactions data based on household_id
joined_data <- filtered_coupons %>%
inner_join(transactions, by = "household_id")
# Step 3: Filter for transactions where redemption_date and transaction_timestamp are on the same day
same_day_transactions <- joined_data %>%
filter(yday(redemption_date) == yday(transaction_timestamp))
# Step 4: Compute the total sales_value across these transactions
total_sales_value <- same_day_transactions %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE))
# Step to find out the number of households that redeemed this specific coupon
number_of_households <- filtered_coupons %>%
summarise(n = n_distinct(household_id))
# Display the number of households and total sales value
print(paste("Number of households that redeemed the coupon:", number_of_households$n))
## [1] "Number of households that redeemed the coupon: 63"
print(paste("Total sales value on the same day of redemption:", total_sales_value$total_sales))
## [1] "Total sales value on the same day of redemption: 7047.59999999992"
Q10: Let’s build onto #9. Using the same redeemed coupon
(campaign_id == “18” & coupon_upc == “10000085475”). In this problem
we are going to calculate the total sales_value for each product_type
that this coupon was applied to so that we can identify which
product_type resulted in the greatest sales when associated with this
coupon.
# Step a: Filter coupon_redemptions for specific campaign_id and coupon_upc
filtered_coupons <- coupon_redemptions %>%
filter(campaign_id == 18, coupon_upc == "10000085475")
# Step b: Inner join with coupons to get the related product_id
joined_coupons <- filtered_coupons %>%
inner_join(coupons, by = "coupon_upc", relationship = "many-to-many")
# Step c: Inner join with products to get product information
joined_products <- joined_coupons %>%
inner_join(products, by = "product_id")
# Step d: Filter for 'vegetables' in the product_category
vegetable_products <- joined_products %>%
filter(str_detect(product_category, regex("vegetables", ignore_case = TRUE)))
# Step e: Inner join with transactions using household_id and product_id
joined_transactions <- vegetable_products %>%
inner_join(transactions, by = c("household_id", "product_id"))
# Step f: Filter for same day transactions
same_day_transactions <- joined_transactions %>%
filter(yday(redemption_date) == yday(transaction_timestamp))
# Step g: Group by product_type
# Step h: Compute total sales_value
# Step i: Arrange by total sales_value
result <- same_day_transactions %>%
group_by(product_type) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
# Display the product_type with the largest total sales value
print(result)
## # A tibble: 19 × 2
## product_type total_sales
## <chr> <dbl>
## 1 CELERY 25.4
## 2 HEAD LETTUCE 13.9
## 3 ORGANIC APPLES 10.5
## 4 CUCUMBERS 9.6
## 5 VARIETY LETTUCE 9.16
## 6 ORGANIC CARROTS 8.27
## 7 CUT VEGETABLES ALL OTHER 7.97
## 8 CABBAGE 6.61
## 9 VEGETABLES ALL OTHER 6.56
## 10 ASPARAGUS 5.93
## 11 ORGANIC HERBS 4.98
## 12 ORGANIC POTATOES 3.96
## 13 BEANS 3.29
## 14 CARROTS SLICED/SHRED/STICKS 2.5
## 15 SPINACH BULK 2.37
## 16 POTATOES PROCESSED 2.29
## 17 ORGANIC SQUASH 2.1
## 18 ORGANIC VEGETABLES ALL OTHERS 1.99
## 19 GREENS 0.61