### Module 4
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
transactions <- get_transactions()
dim(transactions)
## [1] 1469307 11
## [1] 1469307 11
promotions <- get_promotions()
dim(promotions)
## [1] 20940529 5
## [1] 20940529 5
# Question 1 ----------------------------------------------------------------------
### Fill in the blanks with the correct join operations to answer the following questions.
### Using the transactions and demographics data,
### how many of the 1,469,307 transactions do we have demographic
### information for? How many of the transactions do we not have
### demographic information on?
# how many transaction do we have demographics on?
transactions %>%
inner_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 828850
# how many transaction do we NOT have demographics on?
transactions %>%
anti_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 640457
# Question 2 ----------------------------------------------------------------------
## Fill in the blanks to perform an inner join
## with the transactions and demographics data. Then,
## compute the total sales_value by age category to
## identify which age group generates the most sales.
transactions %>%
inner_join(demographics, by = "household_id") %>%
group_by(age) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
## # A tibble: 6 × 2
## age total_sales
## <ord> <dbl>
## 1 45-54 971822.
## 2 35-44 724357.
## 3 25-34 453372.
## 4 65+ 176601.
## 5 55-64 173154.
## 6 19-24 125673.
# Question 3 ----------------------------------------------------------------------
## Identify all households that have total sales (sales_value) of $1000 or more.
## To do this, fill in the blanks to compute total sales by household ID
## and then filter for those household IDs that have total_sales
## equal to or greater than $1000.
# Identify households with $1000 or more in total sales
hshld_1000 <- transactions %>%
group_by(household_id) %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
filter(total_sales >= 1000)
hshld_1000
## # A tibble: 1,378 × 2
## household_id total_sales
## <chr> <dbl>
## 1 1 2416.
## 2 100 1337.
## 3 1000 2509.
## 4 1001 2559.
## 5 1003 1404.
## 6 1004 2487.
## 7 1005 1756.
## 8 101 4483.
## 9 1010 1734.
## 10 1011 1909.
## # ℹ 1,368 more rows
### Now, join the above results with the demographics data to determine:
### • How many of these households do we have demographic data on?
### • How many do we not have demographic on?
### • For those that we do have demographics on, which income range
### produces the most households that spend $1000 or more?
# How many of these households do we have demographic data on?
hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 742
# How many do we not have demographic on?
hshld_1000 %>%
anti_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 636
# Which income range produces the most households that spend \$1000 or more?
hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
group_by(income) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 12 × 2
## income n
## <ord> <int>
## 1 50-74K 185
## 2 35-49K 154
## 3 75-99K 88
## 4 25-34K 69
## 5 15-24K 67
## 6 Under 15K 58
## 7 125-149K 36
## 8 100-124K 33
## 9 150-174K 27
## 10 175-199K 10
## 11 250K+ 10
## 12 200-249K 5
# Question 4 ----------------------------------------------------------------------
### 4. Using the promotions and transactions data, compute the total sales
### for all products that were in a display in the front of the store
### (display_location = 1).
# join transactions and filtered promotions data
front_display_trans <- promotions %>%
filter(display_location == 1) %>%
inner_join(transactions, by = c('product_id', 'store_id', 'week'))
front_display_trans
## # A tibble: 11,134 × 13
## product_id store_id display_location mailer_location week household_id
## <chr> <chr> <fct> <fct> <int> <chr>
## 1 1043128 341 1 0 1 2327
## 2 1043128 341 1 0 1 2327
## 3 1053690 446 1 D 1 1431
## 4 1076875 32004 1 D 1 1419
## 5 12604552 335 1 0 1 717
## 6 5566748 310 1 A 1 1124
## 7 5566844 310 1 A 1 1124
## 8 5568378 361 1 D 1 1873
## 9 5568509 374 1 0 1 2222
## 10 6463504 370 1 D 1 1483
## # ℹ 11,124 more rows
## # ℹ 7 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>,
## # transaction_timestamp <dttm>
# total sales for all products displayed in the front of the store
front_display_trans %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE))
## # A tibble: 1 × 1
## total_sales
## <dbl>
## 1 24506.
### Now compute the total sales for each product (product_id)
### displayed in the front of the store and identify
### the product_id that had the largest total sales.
# Identify the product displayed in the front of the store that had the
# largest total sales
front_display_trans %>%
group_by(product_id) %>%
summarize(total_front_display_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_front_display_sales)) %>%
slice(1)
## # A tibble: 1 × 2
## product_id total_front_display_sales
## <chr> <dbl>
## 1 5569230 954.
front_display_trans %>%
group_by(product_id) %>%
summarize(total_front_display_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_front_display_sales))
## # A tibble: 2,604 × 2
## product_id total_front_display_sales
## <chr> <dbl>
## 1 5569230 954.
## 2 5569471 534.
## 3 8090537 432.
## 4 8090521 396.
## 5 1065538 254.
## 6 5569845 214.
## 7 8090532 196.
## 8 878715 191.
## 9 837569 180.
## 10 5569374 172.
## # ℹ 2,594 more rows
# Question 5 ----------------------------------------------------------------------
### Fill in the blanks to identify which product_category
### is related to the coupon where campaign_id is
### equal to 18 and coupon_upc is equal to 10000089238?
coupons %>%
filter(campaign_id == 18, coupon_upc == 10000089238) %>%
inner_join(products, by = "product_id")
## # A tibble: 1 × 9
## coupon_upc product_id campaign_id manufacturer_id department brand
## <chr> <chr> <chr> <chr> <chr> <fct>
## 1 10000089238 5582264 18 4814 MEAT National
## # ℹ 3 more variables: product_category <chr>, product_type <chr>,
## # package_size <chr>
# Question 6 ----------------------------------------------------------------------
### Identify all different products that contain “pizza” in
### their product_type description. Which of these
### products produces the greatest amount of total sales
### (compute total sales by product ID and product type)?
pizza_sales <- products %>%
filter(str_detect(product_type, regex("pizza", ignore_case = TRUE))) %>%
inner_join(transactions, by = "product_id") %>%
group_by(product_id, product_type) %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_sales))
pizza_sales
## # A tibble: 394 × 3
## product_id product_type total_sales
## <chr> <chr> <dbl>
## 1 944139 PIZZA/TRADITIONAL 1344.
## 2 906838 PIZZA/PREMIUM 1264.
## 3 12648296 PIZZA/TRADITIONAL 1230.
## 4 969568 PIZZA/TRADITIONAL 1125.
## 5 925626 PIZZA/ECONOMY 1018.
## 6 1127624 PIZZA/ECONOMY 849
## 7 1021116 PIZZA/TRADITIONAL 783.
## 8 1009368 PIZZA/PREMIUM 718.
## 9 9524291 PREP FD: PIZZA (COLD) 717.
## 10 995628 PIZZA/TRADITIONAL 703.
## # ℹ 384 more rows
# Question 7 ----------------------------------------------------------------------
### Fill in the blanks to identify all products that are categorized
### (product_category) as “pizza” but are considered a “snack”
### or “appetizer” (via product_type). Hint: the simplest way
### to do this is to filter first for pizza products and then
### second for products that are snacks or appetizers.
relevant_products <- products %>%
filter(
str_detect(product_category, regex("pizza", ignore_case = TRUE)),
str_detect(product_type, regex("snack|appetizer", ignore_case = TRUE))
)
relevant_products
## # A tibble: 225 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 28892 69 GROCERY Private FROZEN PIZZA SNACKS/APPET…
## 2 30002 69 GROCERY Private FROZEN PIZZA SNACKS/APPET…
## 3 42687 1873 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 4 80730 1142 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 5 81421 1423 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 6 83722 1039 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 7 85737 1142 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 8 92490 1039 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 9 92902 1142 GROCERY National FROZEN PIZZA SNACKS/APPET…
## 10 95165 1423 GROCERY National FROZEN PIZZA SNACKS/APPET…
## # ℹ 215 more rows
## # ℹ 1 more variable: package_size <chr>
### Now fill in the blanks to join the above relevant
### pizza products with the transactions data, compute the
### total quantity of items sold by product ID.
### Which of these products (product_id) have the most number of
### sales (which we are measuring by total quantity)?
relevant_products %>%
inner_join(transactions, by = 'product_id') %>%
group_by(product_id) %>%
summarize(total_qty = sum(quantity, na.rm = TRUE)) %>%
arrange(desc(total_qty))
## # A tibble: 188 × 2
## product_id total_qty
## <chr> <dbl>
## 1 845193 847
## 2 890695 718
## 3 907631 632
## 4 856252 479
## 5 1103105 276
## 6 1043494 241
## 7 999639 228
## 8 1035676 223
## 9 1123720 207
## 10 1054216 194
## # ℹ 178 more rows
### 1 845193 847
# Question 8 ----------------------------------------------------------------------
### Identify all products that contain “peanut butter” in their product_type.
### How many unique products does this result in?
pb <- products %>%
filter(str_detect(product_type, regex("peanut butter", ignore_case = TRUE)))
tally(pb)
## # A tibble: 1 × 1
## n
## <int>
## 1 144
### For these products, compute the total sales_value by
### month based on the transaction_timestamp. Which
### month produces the most sales value for these products?
### Which month produces the least sales value for these products?
library(lubridate)
pb %>%
inner_join(transactions, by = "product_id") %>%
group_by(month = month(transaction_timestamp, label = TRUE)) %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales))
## # A tibble: 12 × 2
## month total_sales
## <ord> <dbl>
## 1 Dec 1014.
## 2 Sep 903.
## 3 Aug 895.
## 4 Nov 886.
## 5 Oct 881.
## 6 Jun 872.
## 7 Jan 872.
## 8 Jul 858.
## 9 May 845.
## 10 Apr 794.
## 11 Feb 747.
## 12 Mar 735.
# Question 9 ----------------------------------------------------------------------
### Using the coupon_redemptions data, filter for the coupon
### associated with campaign_id 18 and coupon_upc “10000085475”.
### How many households redeemed this coupon? Now, using this coupon,
### identify the total sales_value for all transactions associated
### with the household_ids that redeemed this coupon on the same
### day they redeemed the coupon. To do this you will want to:
### a. filter coupon_redemptions data for campaign_id == “18”
### and coupon_upc == “10000085475”,
coupon_filtered <- coupon_redemptions %>%
filter(campaign_id == 18, coupon_upc == "10000085475")
coupon_filtered
## # A tibble: 63 × 4
## household_id coupon_upc campaign_id redemption_date
## <chr> <chr> <chr> <date>
## 1 1453 10000085475 18 2017-10-31
## 2 2282 10000085475 18 2017-11-01
## 3 2100 10000085475 18 2017-11-02
## 4 58 10000085475 18 2017-11-03
## 5 2200 10000085475 18 2017-11-03
## 6 2451 10000085475 18 2017-11-03
## 7 22 10000085475 18 2017-11-04
## 8 909 10000085475 18 2017-11-04
## 9 2168 10000085475 18 2017-11-04
## 10 438 10000085475 18 2017-11-05
## # ℹ 53 more rows
### b. join with the transactions data so that you only
### include households that redeemed the coupon,
redeemed_transactions <- coupon_filtered %>%
inner_join(transactions, by = "household_id")
redeemed_transactions
## # A tibble: 94,094 × 14
## household_id coupon_upc campaign_id redemption_date store_id basket_id
## <chr> <chr> <chr> <date> <chr> <chr>
## 1 1453 10000085475 18 2017-10-31 31862 31225377936
## 2 1453 10000085475 18 2017-10-31 31862 31225377936
## 3 1453 10000085475 18 2017-10-31 31862 31225377936
## 4 1453 10000085475 18 2017-10-31 31862 31242386043
## 5 1453 10000085475 18 2017-10-31 31862 31242387452
## 6 1453 10000085475 18 2017-10-31 31862 31242387452
## 7 1453 10000085475 18 2017-10-31 31862 31242387452
## 8 1453 10000085475 18 2017-10-31 31862 31242387458
## 9 1453 10000085475 18 2017-10-31 31862 31242387458
## 10 1453 10000085475 18 2017-10-31 31862 31242387458
## # ℹ 94,084 more rows
## # ℹ 8 more variables: product_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
### c. filter for those transactions where the redemption_date
### was made on the same day as the transaction_timestamp (hint: yday()), and
sameday_transactions <- redeemed_transactions %>%
filter(yday(redemption_date) == yday(transaction_timestamp))
sameday_transactions
## # A tibble: 2,503 × 14
## household_id coupon_upc campaign_id redemption_date store_id basket_id
## <chr> <chr> <chr> <date> <chr> <chr>
## 1 1453 10000085475 18 2017-10-31 31862 40556131423
## 2 1453 10000085475 18 2017-10-31 31862 40556131423
## 3 1453 10000085475 18 2017-10-31 31862 40556131423
## 4 1453 10000085475 18 2017-10-31 31862 40556131423
## 5 1453 10000085475 18 2017-10-31 31862 40556131423
## 6 1453 10000085475 18 2017-10-31 31862 40556131423
## 7 1453 10000085475 18 2017-10-31 31862 40556131423
## 8 1453 10000085475 18 2017-10-31 31862 40556131423
## 9 1453 10000085475 18 2017-10-31 31862 40556131423
## 10 1453 10000085475 18 2017-10-31 31862 40556131423
## # ℹ 2,493 more rows
## # ℹ 8 more variables: product_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
### d. then compute the total sales_value across all these transactions.
total_sales <- sameday_transactions %>%
summarize(total_sales_value = sum(sales_value, na.rm = TRUE))
total_sales
## # A tibble: 1 × 1
## total_sales_value
## <dbl>
## 1 7048.
num_households <- n_distinct(coupon_filtered$household_id)
num_households
## [1] 63
# Question 10 ---------------------------------------------------------------------
### 10. Let’s build onto #9. Using the same redeemed coupon
### (campaign_id == “18” & coupon_upc == “10000085475”).
### In this problem we are going to calculate the total
### sales_value for each product_type that this coupon
### was applied to so that we can identify which product_type resulted
### in the greatest sales when associated with this coupon.
### To do this you will want to:
### a. filter coupon_redemptions data for campaign_id == “18”
### and coupon_upc == “10000085475”,
coupon_filtered
## # A tibble: 63 × 4
## household_id coupon_upc campaign_id redemption_date
## <chr> <chr> <chr> <date>
## 1 1453 10000085475 18 2017-10-31
## 2 2282 10000085475 18 2017-11-01
## 3 2100 10000085475 18 2017-11-02
## 4 58 10000085475 18 2017-11-03
## 5 2200 10000085475 18 2017-11-03
## 6 2451 10000085475 18 2017-11-03
## 7 22 10000085475 18 2017-11-04
## 8 909 10000085475 18 2017-11-04
## 9 2168 10000085475 18 2017-11-04
## 10 438 10000085475 18 2017-11-05
## # ℹ 53 more rows
### b. perform an inner join this with the coupons data so that
### we only retain the coupon information for the relevant coupon.
### This step will provide us with the necessary product_id information
### so we can link the coupon to the products purchased.
products_coupon <- coupon_filtered %>%
inner_join(coupons, by = "coupon_upc")
## Warning in inner_join(., coupons, by = "coupon_upc"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 48409 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
products_coupon
## # A tibble: 270,522 × 6
## household_id coupon_upc campaign_id.x redemption_date product_id
## <chr> <chr> <chr> <date> <chr>
## 1 1453 10000085475 18 2017-10-31 1000205
## 2 1453 10000085475 18 2017-10-31 1000237
## 3 1453 10000085475 18 2017-10-31 1000285
## 4 1453 10000085475 18 2017-10-31 1000411
## 5 1453 10000085475 18 2017-10-31 1000830
## 6 1453 10000085475 18 2017-10-31 1001412
## 7 1453 10000085475 18 2017-10-31 1001618
## 8 1453 10000085475 18 2017-10-31 1001703
## 9 1453 10000085475 18 2017-10-31 1001775
## 10 1453 10000085475 18 2017-10-31 1001885
## # ℹ 270,512 more rows
## # ℹ 1 more variable: campaign_id.y <chr>
### c. perform an inner join with the products data so that we can get
### the product information for each product associated with the redeemed coupons.
products_info <- products_coupon %>%
inner_join(products, by = "product_id")
products_info
## # A tibble: 270,522 × 12
## household_id coupon_upc campaign_id.x redemption_date product_id
## <chr> <chr> <chr> <date> <chr>
## 1 1453 10000085475 18 2017-10-31 1000205
## 2 1453 10000085475 18 2017-10-31 1000237
## 3 1453 10000085475 18 2017-10-31 1000285
## 4 1453 10000085475 18 2017-10-31 1000411
## 5 1453 10000085475 18 2017-10-31 1000830
## 6 1453 10000085475 18 2017-10-31 1001412
## 7 1453 10000085475 18 2017-10-31 1001618
## 8 1453 10000085475 18 2017-10-31 1001703
## 9 1453 10000085475 18 2017-10-31 1001775
## 10 1453 10000085475 18 2017-10-31 1001885
## # ℹ 270,512 more rows
## # ℹ 7 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## # department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## # package_size <chr>
view(products_info)
### d. Filter for only those products where “vegetables” is in the
### ‘product_category’ description.
vegetable_products <- products_info %>%
filter(str_detect(product_category, regex("vegetables", ignore_case = TRUE)))
vegetable_products
## # A tibble: 40,446 × 12
## household_id coupon_upc campaign_id.x redemption_date product_id
## <chr> <chr> <chr> <date> <chr>
## 1 1453 10000085475 18 2017-10-31 1000205
## 2 1453 10000085475 18 2017-10-31 1001703
## 3 1453 10000085475 18 2017-10-31 1001885
## 4 1453 10000085475 18 2017-10-31 1004868
## 5 1453 10000085475 18 2017-10-31 1005456
## 6 1453 10000085475 18 2017-10-31 1006586
## 7 1453 10000085475 18 2017-10-31 1007796
## 8 1453 10000085475 18 2017-10-31 1009046
## 9 1453 10000085475 18 2017-10-31 1009959
## 10 1453 10000085475 18 2017-10-31 10119030
## # ℹ 40,436 more rows
## # ℹ 7 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## # department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## # package_size <chr>
### e. Now perform an inner join with the transactions data using the
### household_id and product_id keys.
transactions_vegetables <- vegetable_products %>%
inner_join(transactions, by = "product_id")
## Warning in inner_join(., transactions, by = "product_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 159837 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
transactions_vegetables
## # A tibble: 1,473,759 × 22
## household_id.x coupon_upc campaign_id.x redemption_date product_id
## <chr> <chr> <chr> <date> <chr>
## 1 1453 10000085475 18 2017-10-31 1000205
## 2 1453 10000085475 18 2017-10-31 1000205
## 3 1453 10000085475 18 2017-10-31 1000205
## 4 1453 10000085475 18 2017-10-31 1000205
## 5 1453 10000085475 18 2017-10-31 1000205
## 6 1453 10000085475 18 2017-10-31 1000205
## 7 1453 10000085475 18 2017-10-31 1000205
## 8 1453 10000085475 18 2017-10-31 1000205
## 9 1453 10000085475 18 2017-10-31 1000205
## 10 1453 10000085475 18 2017-10-31 1000205
## # ℹ 1,473,749 more rows
## # ℹ 17 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## # department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## # package_size <chr>, household_id.y <chr>, store_id <chr>, basket_id <chr>,
## # quantity <dbl>, sales_value <dbl>, retail_disc <dbl>, coupon_disc <dbl>,
## # coupon_match_disc <dbl>, week <int>, transaction_timestamp <dttm>
### f. Filter the data so that the day of year of the redemption_date is
### equal to the day of year of the transaction_timestamp (hint: yday()).
same_transactions <- transactions_vegetables %>%
filter(yday(redemption_date) == yday(transaction_timestamp))
same_transactions
## # A tibble: 4,645 × 22
## household_id.x coupon_upc campaign_id.x redemption_date product_id
## <chr> <chr> <chr> <date> <chr>
## 1 1453 10000085475 18 2017-10-31 1005456
## 2 1453 10000085475 18 2017-10-31 1018050
## 3 1453 10000085475 18 2017-10-31 1018050
## 4 1453 10000085475 18 2017-10-31 1036927
## 5 1453 10000085475 18 2017-10-31 1039682
## 6 1453 10000085475 18 2017-10-31 1051516
## 7 1453 10000085475 18 2017-10-31 1068715
## 8 1453 10000085475 18 2017-10-31 1119051
## 9 1453 10000085475 18 2017-10-31 12262748
## 10 1453 10000085475 18 2017-10-31 12731183
## # ℹ 4,635 more rows
## # ℹ 17 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## # department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## # package_size <chr>, household_id.y <chr>, store_id <chr>, basket_id <chr>,
## # quantity <dbl>, sales_value <dbl>, retail_disc <dbl>, coupon_disc <dbl>,
## # coupon_match_disc <dbl>, week <int>, transaction_timestamp <dttm>
### g. Now you can group by product_type,
product_type <- same_transactions %>%
group_by(product_type) %>%
summarize(total_sales_value = sum(sales_value, na.rm = TRUE))
product_type
## # A tibble: 43 × 2
## product_type total_sales_value
## <chr> <dbl>
## 1 ASPARAGUS 567.
## 2 BEANS 261.
## 3 BLUEBERRIES 3.84
## 4 BROCCOLI/CAULIFLOWER PROCESSED 41.3
## 5 CABBAGE 387.
## 6 CARROTS MINI PEELED 3.98
## 7 CARROTS SLICED/SHRED/STICKS 125.
## 8 CELERY 1084.
## 9 CELERY CHOPPED/STICKS 25.5
## 10 CUCUMBERS 737.
## # ℹ 33 more rows
### h. compute the total sales_value, and
sorted_sales <- product_type %>%
arrange(desc(total_sales_value))
sorted_sales
## # A tibble: 43 × 2
## product_type total_sales_value
## <chr> <dbl>
## 1 CELERY 1084.
## 2 HEAD LETTUCE 770.
## 3 CUCUMBERS 737.
## 4 VARIETY LETTUCE 594.
## 5 ASPARAGUS 567.
## 6 GREENS 530.
## 7 VEGETABLES ALL OTHER 461.
## 8 CABBAGE 387.
## 9 ORGANIC SALAD MIX 369.
## 10 ORGANIC HERBS 364.
## # ℹ 33 more rows
### i. arrange the data to identify the product_type with the
### largest total sales value.
sorted_sales
## # A tibble: 43 × 2
## product_type total_sales_value
## <chr> <dbl>
## 1 CELERY 1084.
## 2 HEAD LETTUCE 770.
## 3 CUCUMBERS 737.
## 4 VARIETY LETTUCE 594.
## 5 ASPARAGUS 567.
## 6 GREENS 530.
## 7 VEGETABLES ALL OTHER 461.
## 8 CABBAGE 387.
## 9 ORGANIC SALAD MIX 369.
## 10 ORGANIC HERBS 364.
## # ℹ 33 more rows