Module-Lab.-Quynh-Tran.R

### Module 4


library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

transactions <- get_transactions()
dim(transactions)

## [1] 1469307      11

## [1] 1469307 11
promotions <- get_promotions()
dim(promotions)

## [1] 20940529        5

## [1] 20940529 5


# Question 1 ----------------------------------------------------------------------

### Fill in the blanks with the correct join operations to answer the following questions. 
### Using the transactions and demographics data, 
### how many of the 1,469,307 transactions do we have demographic
### information for? How many of the transactions do we not have 
### demographic information on?


# how many transaction do we have demographics on?

transactions %>%
  inner_join(demographics, by = "household_id") %>%
  tally()

## # A tibble: 1 × 1
##        n
##    <int>
## 1 828850

# how many transaction do we NOT have demographics on?
transactions %>%
  anti_join(demographics, by = "household_id") %>%
  tally()

## # A tibble: 1 × 1
##        n
##    <int>
## 1 640457

# Question 2 ----------------------------------------------------------------------


## Fill in the blanks to perform an inner join 
## with the transactions and demographics data. Then,
## compute the total sales_value by age category to 
## identify which age group generates the most sales.

transactions %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(age) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales))

## # A tibble: 6 × 2
##   age   total_sales
##   <ord>       <dbl>
## 1 45-54     971822.
## 2 35-44     724357.
## 3 25-34     453372.
## 4 65+       176601.
## 5 55-64     173154.
## 6 19-24     125673.

# Question 3 ----------------------------------------------------------------------

## Identify all households that have total sales (sales_value) of $1000 or more. 
## To do this, fill in the blanks to compute total sales by household ID 
## and then filter for those household IDs that have total_sales 
## equal to or greater than $1000.


# Identify households with $1000 or more in total sales

hshld_1000 <- transactions %>%
  group_by(household_id) %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  filter(total_sales >= 1000)

hshld_1000

## # A tibble: 1,378 × 2
##    household_id total_sales
##    <chr>              <dbl>
##  1 1                  2416.
##  2 100                1337.
##  3 1000               2509.
##  4 1001               2559.
##  5 1003               1404.
##  6 1004               2487.
##  7 1005               1756.
##  8 101                4483.
##  9 1010               1734.
## 10 1011               1909.
## # ℹ 1,368 more rows

### Now, join the above results with the demographics data to determine:
###  • How many of these households do we have demographic data on?
###  • How many do we not have demographic on?
###  • For those that we do have demographics on, which income range 
###    produces the most households that spend $1000 or more?

# How many of these households do we have demographic data on?
hshld_1000 %>%
  inner_join(demographics, by = "household_id") %>%
  tally()

## # A tibble: 1 × 1
##       n
##   <int>
## 1   742

# How many do we not have demographic on?
hshld_1000 %>%
  anti_join(demographics, by = "household_id") %>%
  tally()

## # A tibble: 1 × 1
##       n
##   <int>
## 1   636

# Which income range produces the most households that spend \$1000 or more?
hshld_1000 %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(income) %>%
  tally() %>%
  arrange(desc(n))

## # A tibble: 12 × 2
##    income        n
##    <ord>     <int>
##  1 50-74K      185
##  2 35-49K      154
##  3 75-99K       88
##  4 25-34K       69
##  5 15-24K       67
##  6 Under 15K    58
##  7 125-149K     36
##  8 100-124K     33
##  9 150-174K     27
## 10 175-199K     10
## 11 250K+        10
## 12 200-249K      5

# Question 4 ----------------------------------------------------------------------

### 4. Using the promotions and transactions data, compute the total sales 
### for all products that were in a display in the front of the store 
### (display_location = 1).

# join transactions and filtered promotions data

front_display_trans <- promotions %>%
  filter(display_location == 1) %>%
  inner_join(transactions, by = c('product_id', 'store_id', 'week'))

front_display_trans

## # A tibble: 11,134 × 13
##    product_id store_id display_location mailer_location  week household_id
##    <chr>      <chr>    <fct>            <fct>           <int> <chr>       
##  1 1043128    341      1                0                   1 2327        
##  2 1043128    341      1                0                   1 2327        
##  3 1053690    446      1                D                   1 1431        
##  4 1076875    32004    1                D                   1 1419        
##  5 12604552   335      1                0                   1 717         
##  6 5566748    310      1                A                   1 1124        
##  7 5566844    310      1                A                   1 1124        
##  8 5568378    361      1                D                   1 1873        
##  9 5568509    374      1                0                   1 2222        
## 10 6463504    370      1                D                   1 1483        
## # ℹ 11,124 more rows
## # ℹ 7 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>,
## #   transaction_timestamp <dttm>

# total sales for all products displayed in the front of the store
front_display_trans %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE))

## # A tibble: 1 × 1
##   total_sales
##         <dbl>
## 1      24506.

### Now compute the total sales for each product (product_id) 
### displayed in the front of the store and identify
### the product_id that had the largest total sales.


# Identify the product displayed in the front of the store that had the
# largest total sales

front_display_trans %>%
  group_by(product_id) %>%
  summarize(total_front_display_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_front_display_sales)) %>%
  slice(1)

## # A tibble: 1 × 2
##   product_id total_front_display_sales
##   <chr>                          <dbl>
## 1 5569230                         954.

front_display_trans %>%
  group_by(product_id) %>%
  summarize(total_front_display_sales = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_front_display_sales))

## # A tibble: 2,604 × 2
##    product_id total_front_display_sales
##    <chr>                          <dbl>
##  1 5569230                         954.
##  2 5569471                         534.
##  3 8090537                         432.
##  4 8090521                         396.
##  5 1065538                         254.
##  6 5569845                         214.
##  7 8090532                         196.
##  8 878715                          191.
##  9 837569                          180.
## 10 5569374                         172.
## # ℹ 2,594 more rows

# Question 5 ----------------------------------------------------------------------


### Fill in the blanks to identify which product_category 
### is related to the coupon where campaign_id is
### equal to 18 and coupon_upc is equal to 10000089238?


coupons %>%
  filter(campaign_id == 18, coupon_upc == 10000089238) %>%
  inner_join(products, by = "product_id")

## # A tibble: 1 × 9
##   coupon_upc  product_id campaign_id manufacturer_id department brand   
##   <chr>       <chr>      <chr>       <chr>           <chr>      <fct>   
## 1 10000089238 5582264    18          4814            MEAT       National
## # ℹ 3 more variables: product_category <chr>, product_type <chr>,
## #   package_size <chr>

# Question 6 ----------------------------------------------------------------------

### Identify all different products that contain “pizza” in 
### their product_type description. Which of these
### products produces the greatest amount of total sales 
### (compute total sales by product ID and product type)?


pizza_sales <- products %>%
  filter(str_detect(product_type, regex("pizza", ignore_case = TRUE))) %>%
  inner_join(transactions, by = "product_id") %>%
  group_by(product_id, product_type) %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales))

pizza_sales

## # A tibble: 394 × 3
##    product_id product_type          total_sales
##    <chr>      <chr>                       <dbl>
##  1 944139     PIZZA/TRADITIONAL           1344.
##  2 906838     PIZZA/PREMIUM               1264.
##  3 12648296   PIZZA/TRADITIONAL           1230.
##  4 969568     PIZZA/TRADITIONAL           1125.
##  5 925626     PIZZA/ECONOMY               1018.
##  6 1127624    PIZZA/ECONOMY                849 
##  7 1021116    PIZZA/TRADITIONAL            783.
##  8 1009368    PIZZA/PREMIUM                718.
##  9 9524291    PREP FD: PIZZA (COLD)        717.
## 10 995628     PIZZA/TRADITIONAL            703.
## # ℹ 384 more rows

# Question 7 ----------------------------------------------------------------------

### Fill in the blanks to identify all products that are categorized 
### (product_category) as “pizza” but are considered a “snack” 
### or “appetizer” (via product_type). Hint: the simplest way 
### to do this is to filter first for pizza products and then 
### second for products that are snacks or appetizers.


relevant_products <- products %>%
  filter(
    str_detect(product_category, regex("pizza", ignore_case = TRUE)),  
    str_detect(product_type, regex("snack|appetizer", ignore_case = TRUE))
)

relevant_products

## # A tibble: 225 × 7
##    product_id manufacturer_id department brand    product_category product_type 
##    <chr>      <chr>           <chr>      <fct>    <chr>            <chr>        
##  1 28892      69              GROCERY    Private  FROZEN PIZZA     SNACKS/APPET…
##  2 30002      69              GROCERY    Private  FROZEN PIZZA     SNACKS/APPET…
##  3 42687      1873            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  4 80730      1142            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  5 81421      1423            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  6 83722      1039            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  7 85737      1142            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  8 92490      1039            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
##  9 92902      1142            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
## 10 95165      1423            GROCERY    National FROZEN PIZZA     SNACKS/APPET…
## # ℹ 215 more rows
## # ℹ 1 more variable: package_size <chr>

### Now fill in the blanks to join the above relevant 
### pizza products with the transactions data, compute the
### total quantity of items sold by product ID. 
### Which of these products (product_id) have the most number of
### sales (which we are measuring by total quantity)?


relevant_products %>%
  inner_join(transactions, by = 'product_id') %>%
  group_by(product_id) %>%
  summarize(total_qty = sum(quantity, na.rm = TRUE)) %>%
  arrange(desc(total_qty))

## # A tibble: 188 × 2
##    product_id total_qty
##    <chr>          <dbl>
##  1 845193           847
##  2 890695           718
##  3 907631           632
##  4 856252           479
##  5 1103105          276
##  6 1043494          241
##  7 999639           228
##  8 1035676          223
##  9 1123720          207
## 10 1054216          194
## # ℹ 178 more rows

  ###  1 845193           847




# Question 8 ----------------------------------------------------------------------


### Identify all products that contain “peanut butter” in their product_type. 
### How many unique products does this result in?
  
  
pb <- products %>%
  filter(str_detect(product_type, regex("peanut butter", ignore_case = TRUE)))

tally(pb)

## # A tibble: 1 × 1
##       n
##   <int>
## 1   144

### For these products, compute the total sales_value by 
### month based on the transaction_timestamp. Which
### month produces the most sales value for these products? 
### Which month produces the least sales value for these products?


library(lubridate)

pb %>%
  inner_join(transactions, by = "product_id") %>%
  group_by(month = month(transaction_timestamp, label = TRUE)) %>% 
  summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>% 
  arrange(desc(total_sales))

## # A tibble: 12 × 2
##    month total_sales
##    <ord>       <dbl>
##  1 Dec         1014.
##  2 Sep          903.
##  3 Aug          895.
##  4 Nov          886.
##  5 Oct          881.
##  6 Jun          872.
##  7 Jan          872.
##  8 Jul          858.
##  9 May          845.
## 10 Apr          794.
## 11 Feb          747.
## 12 Mar          735.

# Question 9 ----------------------------------------------------------------------


### Using the coupon_redemptions data, filter for the coupon 
### associated with campaign_id 18 and coupon_upc “10000085475”. 
### How many households redeemed this coupon? Now, using this coupon,
### identify the total sales_value for all transactions associated 
### with the household_ids that redeemed this coupon on the same 
### day they redeemed the coupon. To do this you will want to:

### a. filter coupon_redemptions data for campaign_id == “18” 
###    and coupon_upc == “10000085475”,

coupon_filtered <- coupon_redemptions %>%
  filter(campaign_id == 18, coupon_upc == "10000085475")
coupon_filtered

## # A tibble: 63 × 4
##    household_id coupon_upc  campaign_id redemption_date
##    <chr>        <chr>       <chr>       <date>         
##  1 1453         10000085475 18          2017-10-31     
##  2 2282         10000085475 18          2017-11-01     
##  3 2100         10000085475 18          2017-11-02     
##  4 58           10000085475 18          2017-11-03     
##  5 2200         10000085475 18          2017-11-03     
##  6 2451         10000085475 18          2017-11-03     
##  7 22           10000085475 18          2017-11-04     
##  8 909          10000085475 18          2017-11-04     
##  9 2168         10000085475 18          2017-11-04     
## 10 438          10000085475 18          2017-11-05     
## # ℹ 53 more rows

### b. join with the transactions data so that you only 
###    include households that redeemed the coupon,

redeemed_transactions <- coupon_filtered %>%
  inner_join(transactions, by = "household_id")
redeemed_transactions

## # A tibble: 94,094 × 14
##    household_id coupon_upc  campaign_id redemption_date store_id basket_id  
##    <chr>        <chr>       <chr>       <date>          <chr>    <chr>      
##  1 1453         10000085475 18          2017-10-31      31862    31225377936
##  2 1453         10000085475 18          2017-10-31      31862    31225377936
##  3 1453         10000085475 18          2017-10-31      31862    31225377936
##  4 1453         10000085475 18          2017-10-31      31862    31242386043
##  5 1453         10000085475 18          2017-10-31      31862    31242387452
##  6 1453         10000085475 18          2017-10-31      31862    31242387452
##  7 1453         10000085475 18          2017-10-31      31862    31242387452
##  8 1453         10000085475 18          2017-10-31      31862    31242387458
##  9 1453         10000085475 18          2017-10-31      31862    31242387458
## 10 1453         10000085475 18          2017-10-31      31862    31242387458
## # ℹ 94,084 more rows
## # ℹ 8 more variables: product_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

### c. filter for those transactions where the redemption_date 
###    was made on the same day as the transaction_timestamp (hint: yday()), and

sameday_transactions <- redeemed_transactions %>%
  filter(yday(redemption_date) == yday(transaction_timestamp))
sameday_transactions

## # A tibble: 2,503 × 14
##    household_id coupon_upc  campaign_id redemption_date store_id basket_id  
##    <chr>        <chr>       <chr>       <date>          <chr>    <chr>      
##  1 1453         10000085475 18          2017-10-31      31862    40556131423
##  2 1453         10000085475 18          2017-10-31      31862    40556131423
##  3 1453         10000085475 18          2017-10-31      31862    40556131423
##  4 1453         10000085475 18          2017-10-31      31862    40556131423
##  5 1453         10000085475 18          2017-10-31      31862    40556131423
##  6 1453         10000085475 18          2017-10-31      31862    40556131423
##  7 1453         10000085475 18          2017-10-31      31862    40556131423
##  8 1453         10000085475 18          2017-10-31      31862    40556131423
##  9 1453         10000085475 18          2017-10-31      31862    40556131423
## 10 1453         10000085475 18          2017-10-31      31862    40556131423
## # ℹ 2,493 more rows
## # ℹ 8 more variables: product_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

### d. then compute the total sales_value across all these transactions.

total_sales <- sameday_transactions %>%
  summarize(total_sales_value = sum(sales_value, na.rm = TRUE))
total_sales

## # A tibble: 1 × 1
##   total_sales_value
##               <dbl>
## 1             7048.

num_households <- n_distinct(coupon_filtered$household_id)
num_households

## [1] 63

# Question 10 ---------------------------------------------------------------------


### 10. Let’s build onto #9. Using the same redeemed coupon 
### (campaign_id == “18” & coupon_upc == “10000085475”). 
### In this problem we are going to calculate the total 
### sales_value for each product_type that this coupon 
### was applied to so that we can identify which product_type resulted
### in the greatest sales when associated with this coupon.

### To do this you will want to:

### a. filter coupon_redemptions data for campaign_id == “18” 
### and coupon_upc == “10000085475”,

coupon_filtered

## # A tibble: 63 × 4
##    household_id coupon_upc  campaign_id redemption_date
##    <chr>        <chr>       <chr>       <date>         
##  1 1453         10000085475 18          2017-10-31     
##  2 2282         10000085475 18          2017-11-01     
##  3 2100         10000085475 18          2017-11-02     
##  4 58           10000085475 18          2017-11-03     
##  5 2200         10000085475 18          2017-11-03     
##  6 2451         10000085475 18          2017-11-03     
##  7 22           10000085475 18          2017-11-04     
##  8 909          10000085475 18          2017-11-04     
##  9 2168         10000085475 18          2017-11-04     
## 10 438          10000085475 18          2017-11-05     
## # ℹ 53 more rows

### b. perform an inner join this with the coupons data so that 
### we only retain the coupon information for the relevant coupon. 
### This step will provide us with the necessary product_id information 
### so we can link the coupon to the products purchased.

products_coupon <- coupon_filtered %>%
  inner_join(coupons, by = "coupon_upc")

## Warning in inner_join(., coupons, by = "coupon_upc"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 48409 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

products_coupon

## # A tibble: 270,522 × 6
##    household_id coupon_upc  campaign_id.x redemption_date product_id
##    <chr>        <chr>       <chr>         <date>          <chr>     
##  1 1453         10000085475 18            2017-10-31      1000205   
##  2 1453         10000085475 18            2017-10-31      1000237   
##  3 1453         10000085475 18            2017-10-31      1000285   
##  4 1453         10000085475 18            2017-10-31      1000411   
##  5 1453         10000085475 18            2017-10-31      1000830   
##  6 1453         10000085475 18            2017-10-31      1001412   
##  7 1453         10000085475 18            2017-10-31      1001618   
##  8 1453         10000085475 18            2017-10-31      1001703   
##  9 1453         10000085475 18            2017-10-31      1001775   
## 10 1453         10000085475 18            2017-10-31      1001885   
## # ℹ 270,512 more rows
## # ℹ 1 more variable: campaign_id.y <chr>

### c. perform an inner join with the products data so that we can get 
### the product information for each product associated with the redeemed coupons.

products_info <- products_coupon %>%
  inner_join(products, by = "product_id")
products_info

## # A tibble: 270,522 × 12
##    household_id coupon_upc  campaign_id.x redemption_date product_id
##    <chr>        <chr>       <chr>         <date>          <chr>     
##  1 1453         10000085475 18            2017-10-31      1000205   
##  2 1453         10000085475 18            2017-10-31      1000237   
##  3 1453         10000085475 18            2017-10-31      1000285   
##  4 1453         10000085475 18            2017-10-31      1000411   
##  5 1453         10000085475 18            2017-10-31      1000830   
##  6 1453         10000085475 18            2017-10-31      1001412   
##  7 1453         10000085475 18            2017-10-31      1001618   
##  8 1453         10000085475 18            2017-10-31      1001703   
##  9 1453         10000085475 18            2017-10-31      1001775   
## 10 1453         10000085475 18            2017-10-31      1001885   
## # ℹ 270,512 more rows
## # ℹ 7 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## #   department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## #   package_size <chr>

view(products_info)

### d. Filter for only those products where “vegetables” is in the 
### ‘product_category’ description.

vegetable_products <- products_info %>%
  filter(str_detect(product_category, regex("vegetables", ignore_case = TRUE)))
vegetable_products

## # A tibble: 40,446 × 12
##    household_id coupon_upc  campaign_id.x redemption_date product_id
##    <chr>        <chr>       <chr>         <date>          <chr>     
##  1 1453         10000085475 18            2017-10-31      1000205   
##  2 1453         10000085475 18            2017-10-31      1001703   
##  3 1453         10000085475 18            2017-10-31      1001885   
##  4 1453         10000085475 18            2017-10-31      1004868   
##  5 1453         10000085475 18            2017-10-31      1005456   
##  6 1453         10000085475 18            2017-10-31      1006586   
##  7 1453         10000085475 18            2017-10-31      1007796   
##  8 1453         10000085475 18            2017-10-31      1009046   
##  9 1453         10000085475 18            2017-10-31      1009959   
## 10 1453         10000085475 18            2017-10-31      10119030  
## # ℹ 40,436 more rows
## # ℹ 7 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## #   department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## #   package_size <chr>

### e. Now perform an inner join with the transactions data using the 
### household_id and product_id keys.

transactions_vegetables <- vegetable_products %>%
  inner_join(transactions, by = "product_id")

## Warning in inner_join(., transactions, by = "product_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 159837 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

transactions_vegetables

## # A tibble: 1,473,759 × 22
##    household_id.x coupon_upc  campaign_id.x redemption_date product_id
##    <chr>          <chr>       <chr>         <date>          <chr>     
##  1 1453           10000085475 18            2017-10-31      1000205   
##  2 1453           10000085475 18            2017-10-31      1000205   
##  3 1453           10000085475 18            2017-10-31      1000205   
##  4 1453           10000085475 18            2017-10-31      1000205   
##  5 1453           10000085475 18            2017-10-31      1000205   
##  6 1453           10000085475 18            2017-10-31      1000205   
##  7 1453           10000085475 18            2017-10-31      1000205   
##  8 1453           10000085475 18            2017-10-31      1000205   
##  9 1453           10000085475 18            2017-10-31      1000205   
## 10 1453           10000085475 18            2017-10-31      1000205   
## # ℹ 1,473,749 more rows
## # ℹ 17 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## #   department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## #   package_size <chr>, household_id.y <chr>, store_id <chr>, basket_id <chr>,
## #   quantity <dbl>, sales_value <dbl>, retail_disc <dbl>, coupon_disc <dbl>,
## #   coupon_match_disc <dbl>, week <int>, transaction_timestamp <dttm>

### f. Filter the data so that the day of year of the redemption_date is 
### equal to the day of year of the transaction_timestamp (hint: yday()).

same_transactions <- transactions_vegetables %>%
  filter(yday(redemption_date) == yday(transaction_timestamp))
same_transactions

## # A tibble: 4,645 × 22
##    household_id.x coupon_upc  campaign_id.x redemption_date product_id
##    <chr>          <chr>       <chr>         <date>          <chr>     
##  1 1453           10000085475 18            2017-10-31      1005456   
##  2 1453           10000085475 18            2017-10-31      1018050   
##  3 1453           10000085475 18            2017-10-31      1018050   
##  4 1453           10000085475 18            2017-10-31      1036927   
##  5 1453           10000085475 18            2017-10-31      1039682   
##  6 1453           10000085475 18            2017-10-31      1051516   
##  7 1453           10000085475 18            2017-10-31      1068715   
##  8 1453           10000085475 18            2017-10-31      1119051   
##  9 1453           10000085475 18            2017-10-31      12262748  
## 10 1453           10000085475 18            2017-10-31      12731183  
## # ℹ 4,635 more rows
## # ℹ 17 more variables: campaign_id.y <chr>, manufacturer_id <chr>,
## #   department <chr>, brand <fct>, product_category <chr>, product_type <chr>,
## #   package_size <chr>, household_id.y <chr>, store_id <chr>, basket_id <chr>,
## #   quantity <dbl>, sales_value <dbl>, retail_disc <dbl>, coupon_disc <dbl>,
## #   coupon_match_disc <dbl>, week <int>, transaction_timestamp <dttm>

### g. Now you can group by product_type,

product_type <- same_transactions %>%
  group_by(product_type) %>%
  summarize(total_sales_value = sum(sales_value, na.rm = TRUE))
product_type

## # A tibble: 43 × 2
##    product_type                   total_sales_value
##    <chr>                                      <dbl>
##  1 ASPARAGUS                                 567.  
##  2 BEANS                                     261.  
##  3 BLUEBERRIES                                 3.84
##  4 BROCCOLI/CAULIFLOWER PROCESSED             41.3 
##  5 CABBAGE                                   387.  
##  6 CARROTS MINI PEELED                         3.98
##  7 CARROTS SLICED/SHRED/STICKS               125.  
##  8 CELERY                                   1084.  
##  9 CELERY CHOPPED/STICKS                      25.5 
## 10 CUCUMBERS                                 737.  
## # ℹ 33 more rows

### h. compute the total sales_value, and

sorted_sales <- product_type %>%
  arrange(desc(total_sales_value))
sorted_sales

## # A tibble: 43 × 2
##    product_type         total_sales_value
##    <chr>                            <dbl>
##  1 CELERY                           1084.
##  2 HEAD LETTUCE                      770.
##  3 CUCUMBERS                         737.
##  4 VARIETY LETTUCE                   594.
##  5 ASPARAGUS                         567.
##  6 GREENS                            530.
##  7 VEGETABLES ALL OTHER              461.
##  8 CABBAGE                           387.
##  9 ORGANIC SALAD MIX                 369.
## 10 ORGANIC HERBS                     364.
## # ℹ 33 more rows

### i. arrange the data to identify the product_type with the 
### largest total sales value.

sorted_sales

## # A tibble: 43 × 2
##    product_type         total_sales_value
##    <chr>                            <dbl>
##  1 CELERY                           1084.
##  2 HEAD LETTUCE                      770.
##  3 CUCUMBERS                         737.
##  4 VARIETY LETTUCE                   594.
##  5 ASPARAGUS                         567.
##  6 GREENS                            530.
##  7 VEGETABLES ALL OTHER              461.
##  8 CABBAGE                           387.
##  9 ORGANIC SALAD MIX                 369.
## 10 ORGANIC HERBS                     364.
## # ℹ 33 more rows

Module-Lab.-Quynh-Tran.R

tranq

2025-02-09