library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
transactions <- get_transactions()
dim(transactions)
## [1] 1469307 11
promotions <- get_promotions()
dim(promotions)
## [1] 20940529 5
transactions %>%
inner_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 828850
transactions %>%
anti_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 640457
compute the total sales_value by age category to identify which age group generates the most sales.
transactions %>%
inner_join(demographics, by = "household_id") %>%
group_by(age) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
## # A tibble: 6 × 2
## age total_sales
## <ord> <dbl>
## 1 45-54 971822.
## 2 35-44 724357.
## 3 25-34 453372.
## 4 65+ 176601.
## 5 55-64 173154.
## 6 19-24 125673.
hshld_1000 <- transactions %>%
group_by(household_id) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
filter(total_sales >= 1000)
hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 742
hshld_1000 %>%
anti_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 636
hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
group_by(income) %>%
summarise(count = sum(total_sales >= 1000)) %>%
arrange(desc(count))
## # A tibble: 12 × 2
## income count
## <ord> <int>
## 1 50-74K 185
## 2 35-49K 154
## 3 75-99K 88
## 4 25-34K 69
## 5 15-24K 67
## 6 Under 15K 58
## 7 125-149K 36
## 8 100-124K 33
## 9 150-174K 27
## 10 175-199K 10
## 11 250K+ 10
## 12 200-249K 5
front_display_trans <- promotions %>%
filter(display_location == 1) %>%
inner_join(transactions, by = c('product_id', 'store_id', 'week'))
front_display_trans %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE))
## # A tibble: 1 × 1
## total_sales
## <dbl>
## 1 24506.
front_display_trans %>%
group_by(product_id) %>%
summarize(total_front_display_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_front_display_sales)) %>%
slice(1)
## # A tibble: 1 × 2
## product_id total_front_display_sales
## <chr> <dbl>
## 1 5569230 954.
pizza_products <- products %>%
filter(grepl("pizza", product_type, ignore.case = TRUE))
pizza_sales <- transactions %>%
inner_join(pizza_products, by = "product_id") %>%
group_by(product_id, product_type) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = 'drop') %>%
arrange(desc(total_sales))
top_pizza_product <- pizza_sales %>%
slice(1)
pizza_sales
## # A tibble: 394 × 3
## product_id product_type total_sales
## <chr> <chr> <dbl>
## 1 944139 PIZZA/TRADITIONAL 1344.
## 2 906838 PIZZA/PREMIUM 1264.
## 3 12648296 PIZZA/TRADITIONAL 1230.
## 4 969568 PIZZA/TRADITIONAL 1125.
## 5 925626 PIZZA/ECONOMY 1018.
## 6 1127624 PIZZA/ECONOMY 849
## 7 1021116 PIZZA/TRADITIONAL 783.
## 8 1009368 PIZZA/PREMIUM 718.
## 9 9524291 PREP FD: PIZZA (COLD) 717.
## 10 995628 PIZZA/TRADITIONAL 703.
## # ℹ 384 more rows
top_pizza_product
## # A tibble: 1 × 3
## product_id product_type total_sales
## <chr> <chr> <dbl>
## 1 944139 PIZZA/TRADITIONAL 1344.
relevant_products <- products %>%
filter(
str_detect(product_category, regex("pizza", ignore_case = TRUE)),
str_detect(product_type, regex("snack|appetizer", ignore_case = TRUE))
)
relevant_products %>%
inner_join(transactions, by = 'product_id') %>%
group_by(product_id)%>%
summarise(total_qty = sum(quantity)) %>%
arrange(desc(total_qty))
## # A tibble: 188 × 2
## product_id total_qty
## <chr> <dbl>
## 1 845193 847
## 2 890695 718
## 3 907631 632
## 4 856252 479
## 5 1103105 276
## 6 1043494 241
## 7 999639 228
## 8 1035676 223
## 9 1123720 207
## 10 1054216 194
## # ℹ 178 more rows
pb <- products %>%
filter(str_detect(product_type, regex("peanut butter", ignore_case = TRUE)))
tally(pb)
## # A tibble: 1 × 1
## n
## <int>
## 1 144
pb %>%
inner_join(transactions, by = "product_id") %>%
group_by(month = month(transaction_timestamp, label = TRUE)) %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
## # A tibble: 12 × 2
## month total_sales
## <ord> <dbl>
## 1 Dec 1014.
## 2 Sep 903.
## 3 Aug 895.
## 4 Nov 886.
## 5 Oct 881.
## 6 Jun 872.
## 7 Jan 872.
## 8 Jul 858.
## 9 May 845.
## 10 Apr 794.
## 11 Feb 747.
## 12 Mar 735.
filtered_coupons <- coupon_redemptions %>%
filter(campaign_id == 18, coupon_upc == "10000085475")
coupon_transactions <- filtered_coupons %>%
inner_join(transactions, by = "household_id") %>%
distinct(household_id,, .keep_all = TRUE)
print(coupon_transactions)
## # A tibble: 63 × 14
## household_id coupon_upc campaign_id redemption_date store_id basket_id
## <chr> <chr> <chr> <date> <chr> <chr>
## 1 1453 10000085475 18 2017-10-31 31862 31225377936
## 2 2282 10000085475 18 2017-11-01 333 31242735818
## 3 2100 10000085475 18 2017-11-02 311 31316922938
## 4 58 10000085475 18 2017-11-03 299 31198665127
## 5 2200 10000085475 18 2017-11-03 334 31268690906
## 6 2451 10000085475 18 2017-11-03 299 31242600801
## 7 22 10000085475 18 2017-11-04 404 31281012062
## 8 909 10000085475 18 2017-11-04 404 31198770125
## 9 2168 10000085475 18 2017-11-04 306 31281162078
## 10 438 10000085475 18 2017-11-05 320 31769486443
## # ℹ 53 more rows
## # ℹ 8 more variables: product_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
same_day_transactions <- coupon_transactions %>%
filter(yday(redemption_date) == yday(transaction_timestamp))
total_sales_value <- same_day_transactions %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE))
total_sales_value
## # A tibble: 1 × 1
## total_sales
## <dbl>
## 1 0
filtered_coupons <- coupon_redemptions %>%
filter(campaign_id == 18, coupon_upc == "10000085475")
coupon_info <- filtered_coupons %>%
inner_join(coupons, by = c("campaign_id", "coupon_upc"))
## Warning in inner_join(., coupons, by = c("campaign_id", "coupon_upc")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 48409 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
coupon_products <- coupon_info %>%
inner_join(products, by = "product_id")
vegetable_products <- coupon_products %>%
filter(str_detect(product_category, regex("vegetables", ignore_case = TRUE)))
vegetable_transactions <- vegetable_products %>%
inner_join(transactions, by = c("household_id", "product_id"))
same_day_transactions <- vegetable_transactions %>%
filter(yday(redemption_date) == yday(transaction_timestamp))
total_sales_by_product_type <- same_day_transactions %>%
group_by(product_type) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
total_sales_by_product_type
## # A tibble: 19 × 2
## product_type total_sales
## <chr> <dbl>
## 1 CELERY 25.4
## 2 HEAD LETTUCE 13.9
## 3 ORGANIC APPLES 10.5
## 4 CUCUMBERS 9.6
## 5 VARIETY LETTUCE 9.16
## 6 ORGANIC CARROTS 8.27
## 7 CUT VEGETABLES ALL OTHER 7.97
## 8 CABBAGE 6.61
## 9 VEGETABLES ALL OTHER 6.56
## 10 ASPARAGUS 5.93
## 11 ORGANIC HERBS 4.98
## 12 ORGANIC POTATOES 3.96
## 13 BEANS 3.29
## 14 CARROTS SLICED/SHRED/STICKS 2.5
## 15 SPINACH BULK 2.37
## 16 POTATOES PROCESSED 2.29
## 17 ORGANIC SQUASH 2.1
## 18 ORGANIC VEGETABLES ALL OTHERS 1.99
## 19 GREENS 0.61