Installing packages

# Install necessary package
install.packages('tidyverse')
install.packages('lubridate')
install.packages('completejourney')

Loading Libraries and Data

# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
# Get transactions data
transactions <- get_transactions()

# Check dimensions of transactions data
print(dim(transactions))  # Should output [1] 1469307 11
## [1] 1469307      11
# Get promotions data
promotions <- get_promotions()

# Check dimensions of promotions data
print(dim(promotions))  # Should output [1] 20940529 5
## [1] 20940529        5

Q1: Fill in the blanks with the correct join operations to answer the following questions. Using the transactions and demographics data, how many of the 1,469,307 transactions do we have demographic information for? How many of the transactions do we not have demographic information on?

# How many transactions do we have demographics on?
transactions %>%
  inner_join(demographics, by = "household_id") %>%
  tally()
## # A tibble: 1 × 1
##        n
##    <int>
## 1 828850
# How many transactions do we NOT have demographics on?
transactions %>%
  anti_join(demographics, by = "household_id") %>%
  tally()
## # A tibble: 1 × 1
##        n
##    <int>
## 1 640457

Q2: Fill in the blanks to perform an inner join with the transactions and demographics data. Then, compute the total sales_value by age category to identify which age group generates the most sales.

# Compute the total sales_value by age category
transactions %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(age) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales))
## # A tibble: 6 × 2
##   age   total_sales
##   <ord>       <dbl>
## 1 45-54     971822.
## 2 35-44     724357.
## 3 25-34     453372.
## 4 65+       176601.
## 5 55-64     173154.
## 6 19-24     125673.

Q3: Identify all households that have total sales (sales_value) of $1000 or more. To do this, fill in the blanks to compute total sales by household ID and then filter for those household IDs that have total_sales equal to or greater than $1000.

# Identify households with $1000 or more in total sales
hshld_1000 <- transactions %>%
  group_by(household_id) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  filter(total_sales >= 1000)

# How many of these households do we have demographic data on?
hshld_1000 %>%
  inner_join(demographics, by = "household_id") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   742
# How many do we not have demographic on?
hshld_1000 %>%
  anti_join(demographics, by = "household_id") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   636
# Which income range produces the most households that spend $1000 or more?
hshld_1000 %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(income) %>%  # Use the corrected column name
  tally() %>%
  arrange(desc(n))
## # A tibble: 12 × 2
##    income        n
##    <ord>     <int>
##  1 50-74K      185
##  2 35-49K      154
##  3 75-99K       88
##  4 25-34K       69
##  5 15-24K       67
##  6 Under 15K    58
##  7 125-149K     36
##  8 100-124K     33
##  9 150-174K     27
## 10 175-199K     10
## 11 250K+        10
## 12 200-249K      5

Q4: Using the promotions and transactions data, compute the total sales for all products that were in a display in the front of the store (display_location = 1).

# Join transactions and filtered promotions data
front_display_trans <- promotions %>%
  filter(display_location == 1) %>%
  inner_join(transactions, by = c('product_id', 'store_id', 'week'))

# Total sales for all products displayed in the front of the store
front_display_trans %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE))
## # A tibble: 1 × 1
##   total_sales
##         <dbl>
## 1      24506.
# Identify the product displayed in the front of the store that had the largest total sales
front_display_trans %>%
  group_by(product_id) %>%
  summarize(total_front_display_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_front_display_sales))
## # A tibble: 2,604 × 2
##    product_id total_front_display_sales
##    <chr>                          <dbl>
##  1 5569230                         954.
##  2 5569471                         534.
##  3 8090537                         432.
##  4 8090521                         396.
##  5 1065538                         254.
##  6 5569845                         214.
##  7 8090532                         196.
##  8 878715                          191.
##  9 837569                          180.
## 10 5569374                         172.
## # ℹ 2,594 more rows

Q6: Identify all different products that contain “pizza” in their product_type description. Which of these products produces the greatest amount of total sales (compute total sales by product ID and product type)?

# Filter products with "pizza" in their product_type description
pizza_products <- products %>%
  filter(str_detect(product_type, regex("pizza", ignore_case = TRUE)))

# Join these pizza products with the transactions data to compute total sales
pizza_sales <- pizza_products %>%
  inner_join(transactions, by = "product_id") %>%
  group_by(product_id, product_type) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales))
## `summarise()` has grouped output by 'product_id'. You can override using the
## `.groups` argument.
# Display the product with the greatest total sales
top_pizza_product <- pizza_sales %>%
  slice_head(n = 1)
  
# Display pizza_products, pizza_sales, and top_pizza_product
pizza_products
## # A tibble: 532 × 7
##    product_id manufacturer_id department brand    product_category  product_type
##    <chr>      <chr>           <chr>      <fct>    <chr>             <chr>       
##  1 33061      754             GROCERY    National CONDIMENTS/SAUCES PIZZA SAUCE 
##  2 60879      822             NUTRITION  National FROZEN            FROZEN PIZZA
##  3 68688      1722            GROCERY    National FROZEN PIZZA      PIZZA/TRADI…
##  4 89652      1039            GROCERY    National FROZEN PIZZA      PIZZA/ECONO…
##  5 90602      1039            GROCERY    National FROZEN PIZZA      PIZZA/ECONO…
##  6 97125      1039            GROCERY    National FROZEN PIZZA      PIZZA/ECONO…
##  7 101531     1755            GROCERY    National FROZEN PIZZA      PIZZA/TRADI…
##  8 101722     1755            GROCERY    National FROZEN PIZZA      PIZZA/TRADI…
##  9 101723     1755            GROCERY    National FROZEN PIZZA      PIZZA/TRADI…
## 10 101785     1039            GROCERY    National FROZEN PIZZA      PIZZA/ECONO…
## # ℹ 522 more rows
## # ℹ 1 more variable: package_size <chr>
pizza_sales
## # A tibble: 394 × 3
## # Groups:   product_id [394]
##    product_id product_type          total_sales
##    <chr>      <chr>                       <dbl>
##  1 944139     PIZZA/TRADITIONAL           1345.
##  2 906838     PIZZA/PREMIUM               1264.
##  3 12648296   PIZZA/TRADITIONAL           1230.
##  4 969568     PIZZA/TRADITIONAL           1125.
##  5 925626     PIZZA/ECONOMY               1018.
##  6 1127624    PIZZA/ECONOMY                849 
##  7 1021116    PIZZA/TRADITIONAL            783.
##  8 1009368    PIZZA/PREMIUM                718.
##  9 9524291    PREP FD: PIZZA (COLD)        717.
## 10 995628     PIZZA/TRADITIONAL            703.
## # ℹ 384 more rows
top_pizza_product
## # A tibble: 394 × 3
## # Groups:   product_id [394]
##    product_id product_type                 total_sales
##    <chr>      <chr>                              <dbl>
##  1 1002610    PIZZA/TRADITIONAL                  19.0 
##  2 1004596    PIZZA/PREMIUM                     618.  
##  3 1005135    PIZZA/SINGLE SERVE/MICROWAVE        7.38
##  4 1009368    PIZZA/PREMIUM                     718.  
##  5 1013868    PIZZA/SINGLE SERVE/MICROWAVE       51   
##  6 1015134    PIZZA/TRADITIONAL                  80.3 
##  7 101531     PIZZA/TRADITIONAL                   8.65
##  8 1016715    PIZZA/PREMIUM                      73.0 
##  9 101722     PIZZA/TRADITIONAL                   1.65
## 10 101723     PIZZA/TRADITIONAL                   7   
## # ℹ 384 more rows

Q7: Fill in the blanks to identify all products that are categorized (product_category) as “pizza” but are considered a “snack” or “appetizer” (via product_type). Hint: the simplest way to do this is to filter first for pizza products and then second for products that are snacks or appetizers.

# Step 1: Filter for Relevant Products
relevant_products <- products %>%
  filter(
    str_detect(product_category, regex("pizza", ignore_case = TRUE)),
    str_detect(product_type, regex("snack|appetizer", ignore_case = TRUE))
  )

# Display relevant_products
relevant_products
## # A tibble: 225 × 7
##    product_id manufacturer_id department brand    product_category product_type 
##    <chr>      <chr>           <chr>      <fct>    <chr>            <chr>        
##  1 28892      69              GROCERY    Private  FROZEN PIZZA     SNACKS/APPET…
##  2 30002      69              GROCERY    Private  FROZEN PIZZA     SNACKS/APPET…
##  3 42687      1873            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  4 80730      1142            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  5 81421      1423            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  6 83722      1039            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  7 85737      1142            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  8 92490      1039            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  9 92902      1142            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
## 10 95165      1423            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
## # ℹ 215 more rows
## # ℹ 1 more variable: package_size <chr>
# Step 2: Compute the Total Quantity of Items Sold
# Compute total quantity of items sold by product ID
total_quantity_sold <- relevant_products %>%
  inner_join(transactions, by = 'product_id') %>%
  group_by(product_id) %>%
  summarise(total_qty = sum(quantity)) %>%
  arrange(desc(total_qty))

# Display total_quantity_sold
total_quantity_sold
## # A tibble: 188 × 2
##    product_id total_qty
##    <chr>          <dbl>
##  1 845193           847
##  2 890695           718
##  3 907631           632
##  4 856252           479
##  5 1103105          276
##  6 1043494          241
##  7 999639           228
##  8 1035676          223
##  9 1123720          207
## 10 1054216          194
## # ℹ 178 more rows

Q8: Identify all products that contain “peanut butter” in their product_type. How many unique products does this result in?

pb <- products %>%
  filter(str_detect(product_type, regex("peanut butter", ignore_case = TRUE)))

tally(pb)
## # A tibble: 1 × 1
##       n
##   <int>
## 1   144
pb %>%
  inner_join(transactions, by = "product_id") %>%
  group_by(month = month(transaction_timestamp, label = TRUE)) %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales))
## # A tibble: 12 × 2
##    month total_sales
##    <ord>       <dbl>
##  1 Dec         1014.
##  2 Sep          903.
##  3 Aug          895.
##  4 Nov          886.
##  5 Oct          881.
##  6 Jun          872.
##  7 Jan          872.
##  8 Jul          858.
##  9 May          845.
## 10 Apr          794.
## 11 Feb          747.
## 12 Mar          735.
# Display pb
pb
## # A tibble: 144 × 7
##    product_id manufacturer_id department brand    product_category  product_type
##    <chr>      <chr>           <chr>      <fct>    <chr>             <chr>       
##  1 41258      69              GROCERY    Private  PNT BTR/JELLY/JA… PEANUT BUTT…
##  2 43065      69              GROCERY    Private  PNT BTR/JELLY/JA… PEANUT BUTT…
##  3 45277      69              GROCERY    Private  PNT BTR/JELLY/JA… PEANUT BUTT…
##  4 49058      69              GROCERY    Private  PNT BTR/JELLY/JA… PEANUT BUTT…
##  5 155364     1179            GROCERY    National PNT BTR/JELLY/JA… PEANUT BUTT…
##  6 196521     1266            GROCERY    National PNT BTR/JELLY/JA… PEANUT BUTT…
##  7 197599     1266            GROCERY    National PNT BTR/JELLY/JA… PEANUT BUTT…
##  8 198548     1266            GROCERY    National PNT BTR/JELLY/JA… PEANUT BUTT…
##  9 198651     1266            GROCERY    National PNT BTR/JELLY/JA… PEANUT BUTT…
## 10 203890     1266            GROCERY    National PNT BTR/JELLY/JA… PEANUT BUTT…
## # ℹ 134 more rows
## # ℹ 1 more variable: package_size <chr>

Q9:Using the coupon_redemptions data, filter for the coupon associated with campaign_id 18 and coupon_upc “10000085475”. How many households redeemed this coupon? Now, using this coupon, identify the total sales_value for all transactions associated with the household_ids that redeemed this coupon on the same day they redeemed the coupon.

# Step 1: Filter coupon_redemptions for specific campaign_id and coupon_upc
filtered_coupons <- coupon_redemptions %>%
  filter(campaign_id == 18, coupon_upc == "10000085475")

# Step 2: Join with transactions data based on household_id
joined_data <- filtered_coupons %>%
  inner_join(transactions, by = "household_id")

# Step 3: Filter for transactions where redemption_date and transaction_timestamp are on the same day
same_day_transactions <- joined_data %>%
  filter(yday(redemption_date) == yday(transaction_timestamp))

# Step 4: Compute the total sales_value across these transactions
total_sales_value <- same_day_transactions %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE))

# Step to find out the number of households that redeemed this specific coupon
number_of_households <- filtered_coupons %>%
  summarise(n = n_distinct(household_id))

# Display the number of households and total sales value
print(paste("Number of households that redeemed the coupon:", number_of_households$n))
## [1] "Number of households that redeemed the coupon: 63"
print(paste("Total sales value on the same day of redemption:", total_sales_value$total_sales))
## [1] "Total sales value on the same day of redemption: 7047.59999999992"

Q10: Let’s build onto #9. Using the same redeemed coupon (campaign_id == “18” & coupon_upc == “10000085475”). In this problem we are going to calculate the total sales_value for each product_type that this coupon was applied to so that we can identify which product_type resulted in the greatest sales when associated with this coupon.

# Step a: Filter coupon_redemptions for specific campaign_id and coupon_upc
filtered_coupons <- coupon_redemptions %>%
  filter(campaign_id == 18, coupon_upc == "10000085475")

# Step b: Inner join with coupons to get the related product_id
joined_coupons <- filtered_coupons %>%
  inner_join(coupons, by = "coupon_upc", relationship = "many-to-many")

# Step c: Inner join with products to get product information
joined_products <- joined_coupons %>%
  inner_join(products, by = "product_id")

# Step d: Filter for 'vegetables' in the product_category
vegetable_products <- joined_products %>%
  filter(str_detect(product_category, regex("vegetables", ignore_case = TRUE)))

# Step e: Inner join with transactions using household_id and product_id
joined_transactions <- vegetable_products %>%
  inner_join(transactions, by = c("household_id", "product_id"))

# Step f: Filter for same day transactions
same_day_transactions <- joined_transactions %>%
  filter(yday(redemption_date) == yday(transaction_timestamp))

# Step g: Group by product_type
# Step h: Compute total sales_value
# Step i: Arrange by total sales_value
result <- same_day_transactions %>%
  group_by(product_type) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales))

# Display the product_type with the largest total sales value
print(result)
## # A tibble: 19 × 2
##    product_type                  total_sales
##    <chr>                               <dbl>
##  1 CELERY                              25.4 
##  2 HEAD LETTUCE                        13.9 
##  3 ORGANIC APPLES                      10.5 
##  4 CUCUMBERS                            9.6 
##  5 VARIETY LETTUCE                      9.16
##  6 ORGANIC CARROTS                      8.27
##  7 CUT VEGETABLES ALL OTHER             7.97
##  8 CABBAGE                              6.61
##  9 VEGETABLES ALL OTHER                 6.56
## 10 ASPARAGUS                            5.93
## 11 ORGANIC HERBS                        4.98
## 12 ORGANIC POTATOES                     3.96
## 13 BEANS                                3.29
## 14 CARROTS SLICED/SHRED/STICKS          2.5 
## 15 SPINACH BULK                         2.37
## 16 POTATOES PROCESSED                   2.29
## 17 ORGANIC SQUASH                       2.1 
## 18 ORGANIC VEGETABLES ALL OTHERS        1.99
## 19 GREENS                               0.61