library(tidyverse)
library(lubridate)
library(completejourney)
transactions <- get_transactions()
promotions <- get_promotions()
demographics <- completejourney::demographics
products <- completejourney::products
glimpse(transactions)
## Rows: 1,469,307
## Columns: 11
## $ household_id          <chr> "900", "900", "1228", "906", "906", "906", "906"…
## $ store_id              <chr> "330", "330", "406", "319", "319", "319", "319",…
## $ basket_id             <chr> "31198570044", "31198570047", "31198655051", "31…
## $ product_id            <chr> "1095275", "9878513", "1041453", "1020156", "105…
## $ quantity              <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ sales_value           <dbl> 0.50, 0.99, 1.43, 1.50, 2.78, 5.49, 1.50, 1.88, …
## $ retail_disc           <dbl> 0.00, 0.10, 0.15, 0.29, 0.80, 0.50, 0.29, 0.21, …
## $ coupon_disc           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coupon_match_disc     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ week                  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ transaction_timestamp <dttm> 2017-01-01 06:53:26, 2017-01-01 07:10:28, 2017-…
glimpse(promotions)
## Rows: 20,940,529
## Columns: 5
## $ product_id       <chr> "1000050", "1000050", "1000050", "1000092", "1000092"…
## $ store_id         <chr> "316", "337", "441", "292", "293", "295", "298", "299…
## $ display_location <fct> 9, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mailer_location  <fct> 0, 0, 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,…
## $ week             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
glimpse(products)
## Rows: 92,331
## Columns: 7
## $ product_id       <chr> "25671", "26081", "26093", "26190", "26355", "26426",…
## $ manufacturer_id  <chr> "2", "2", "69", "69", "69", "69", "69", "69", "69", "…
## $ department       <chr> "GROCERY", "MISCELLANEOUS", "PASTRY", "GROCERY", "GRO…
## $ brand            <fct> National, National, Private, Private, Private, Privat…
## $ product_category <chr> "FRZN ICE", NA, "BREAD", "FRUIT - SHELF STABLE", "COO…
## $ product_type     <chr> "ICE - CRUSHED/CUBED", NA, "BREAD:ITALIAN/FRENCH", "A…
## $ package_size     <chr> "22 LB", NA, NA, "50 OZ", "14 OZ", "2.5 OZ", "16 OZ",…
glimpse(demographics)
## Rows: 801
## Columns: 8
## $ household_id   <chr> "1", "1001", "1003", "1004", "101", "1012", "1014", "10…
## $ age            <ord> 65+, 45-54, 35-44, 25-34, 45-54, 35-44, 45-54, 45-54, 4…
## $ income         <ord> 35-49K, 50-74K, 25-34K, 15-24K, Under 15K, 35-49K, 15-2…
## $ home_ownership <ord> Homeowner, Homeowner, NA, NA, Homeowner, NA, NA, Homeow…
## $ marital_status <ord> Married, Unmarried, Unmarried, Unmarried, Married, Marr…
## $ household_size <ord> 2, 1, 1, 1, 4, 5+, 4, 1, 5+, 2, 5+, 4, 2, 1, 5+, 1, 1, …
## $ household_comp <ord> 2 Adults No Kids, 1 Adult No Kids, 1 Adult No Kids, 1 A…
## $ kids_count     <ord> 0, 0, 0, 0, 2, 3+, 2, 0, 3+, 0, 3+, 2, 0, 0, 3+, 0, 0, …
names(transactions)
##  [1] "household_id"          "store_id"              "basket_id"            
##  [4] "product_id"            "quantity"              "sales_value"          
##  [7] "retail_disc"           "coupon_disc"           "coupon_match_disc"    
## [10] "week"                  "transaction_timestamp"
names(demographics)
## [1] "household_id"   "age"            "income"         "home_ownership"
## [5] "marital_status" "household_size" "household_comp" "kids_count"
trans_demo <- transactions %>%
  left_join(demographics, by = "household_id")

dim(trans_demo)
## [1] 1469307      18

After having joined the transactions with demographics by household_id, new dataset has 1,469,307 rows and 18 columns.

trans_demo_prod <- trans_demo %>%
  left_join(products, by = "product_id")

dim(trans_demo_prod)
## [1] 1469307      24

After having joined in products id, this dataset still has 1,469,307 rows but now has 24 columns.

final_data <- trans_demo_prod %>%
  left_join(promotions, by = c("product_id", "store_id", "week"))
## Warning in left_join(., promotions, by = c("product_id", "store_id", "week")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 25 of `x` matches multiple rows in `y`.
## ℹ Row 352741 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
dim(final_data)
## [1] 1470002      26

After having joined promotions, the rows increased to 1470002 and 26 columns.

trans_with_demo <- transactions %>%
  inner_join(demographics, by = "household_id")

dim(trans_with_demo)
## [1] 828850     18
trans_without_demo <- transactions %>%
  anti_join(demographics, by = "household_id")

dim(trans_without_demo)
## [1] 640457     11

Out of 1,469,307 total transactions, 828,850 have demographic information, while 640,457 transactions do not.

households_with_demo <- transactions %>%
  semi_join(demographics, by = "household_id") %>%
  distinct(household_id)

nrow(households_with_demo)
## [1] 801

There are 801 unique households with demographic info

households_without_demo <- transactions %>%
  anti_join(demographics, by = "household_id") %>%
  distinct(household_id)

nrow(households_without_demo)
## [1] 1668

There are 1,668 unique households without demographic info

household_spend <- transactions %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(household_id, income) %>%
  summarise(total_spend = sum(sales_value), .groups = "drop")
high_spend_households <- household_spend %>%
  filter(total_spend >= 1000)
high_spend_households %>%
  count(income) %>%
  arrange(desc(n))
## # A tibble: 12 × 2
##    income        n
##    <ord>     <int>
##  1 50-74K      185
##  2 35-49K      154
##  3 75-99K       88
##  4 25-34K       69
##  5 15-24K       67
##  6 Under 15K    58
##  7 125-149K     36
##  8 100-124K     33
##  9 150-174K     27
## 10 175-199K     10
## 11 250K+        10
## 12 200-249K      5

The income range with most household spending of 1,000 or more is 50-74k with 185 households.

n_distinct(transactions$household_id)
## [1] 2469

there are 2469 unique households in transactions data.

library(dplyr)
library(completejourney)

transactions <- get_transactions()
promotions   <- get_promotions()

dim(transactions)  
## [1] 1469307      11
dim(promotions)    
## [1] 20940529        5
transactions <- transactions %>%
  mutate(
    product_id = as.integer(product_id),
    store_id   = as.integer(store_id),
    week       = as.integer(week)
  )

promotions <- promotions %>%
  mutate(
    product_id = as.integer(product_id),
    store_id   = as.integer(store_id),
    week       = as.integer(week)
  )

front_display_trans <- promotions %>%
  filter(display_location == 1) %>%
  distinct(product_id, store_id, week) %>%
  inner_join(transactions, by = c("product_id", "store_id", "week"))

q4_results <- front_display_trans %>%
  group_by(product_id) %>%
  summarise(total_front_display_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_front_display_sales))

q4_results %>% slice(1)
## # A tibble: 1 × 2
##   product_id total_front_display_sales
##        <int>                     <dbl>
## 1    5569230                      954.
dim(transactions); dim(promotions)
## [1] 1469307      11
## [1] 20940529        5
sapply(transactions[c("product_id","store_id","week")], class)
## product_id   store_id       week 
##  "integer"  "integer"  "integer"
nrow(front_display_trans)
## [1] 11134
with_demo <- transactions %>%
  inner_join(demographics, by = "household_id") %>%
  tally()

without_demo <- transactions %>%
  anti_join(demographics, by = "household_id") %>%
  tally()

with_demo
## # A tibble: 1 × 1
##        n
##    <int>
## 1 828850
without_demo
## # A tibble: 1 × 1
##        n
##    <int>
## 1 640457
q2_age_sales <- transactions %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(age) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales))

q2_age_sales
## # A tibble: 6 × 2
##   age   total_sales
##   <ord>       <dbl>
## 1 45-54     971822.
## 2 35-44     724357.
## 3 25-34     453372.
## 4 65+       176601.
## 5 55-64     173154.
## 6 19-24     125673.
q2_age_sales %>% slice(1)
## # A tibble: 1 × 2
##   age   total_sales
##   <ord>       <dbl>
## 1 45-54     971822.
hshld_1000 <- transactions %>%
  group_by(household_id) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  filter(total_sales >= 1000)

hshld_1000 %>%
  inner_join(demographics, by = "household_id") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   742
q3_income_counts <- hshld_1000 %>%
  inner_join(demographics, by = "household_id") %>%
  count(income, sort = TRUE)

q3_income_counts
## # A tibble: 12 × 2
##    income        n
##    <ord>     <int>
##  1 50-74K      185
##  2 35-49K      154
##  3 75-99K       88
##  4 25-34K       69
##  5 15-24K       67
##  6 Under 15K    58
##  7 125-149K     36
##  8 100-124K     33
##  9 150-174K     27
## 10 175-199K     10
## 11 250K+        10
## 12 200-249K      5
q3_income_counts %>% slice(1)
## # A tibble: 1 × 2
##   income     n
##   <ord>  <int>
## 1 50-74K   185
front_products <- promotions %>%
  filter(display_location == 1) %>%
  distinct(product_id)

q4_results <- transactions %>%
  semi_join(front_products, by = "product_id") %>%
  group_by(product_id) %>%
  summarise(
    total_sales = sum(sales_value * quantity, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales))

q4_results %>% slice(1)
## # A tibble: 1 × 2
##   product_id total_sales
##        <int>       <dbl>
## 1     995242      41379.
q5 <- coupons %>%
  filter(campaign_id == 18, coupon_upc == 10000089238) %>%
  inner_join(products, by = "product_id") %>%
  distinct(product_category)

q5
## # A tibble: 1 × 1
##   product_category
##   <chr>           
## 1 SMOKED MEATS
front_display_trans <- promotions %>%
  filter(display_location == 1) %>%
  distinct(product_id, store_id, week) %>%  
  inner_join(transactions, by = c("product_id", "store_id", "week"))

q4_results <- front_display_trans %>%
  group_by(product_id) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales))

q4_results %>% slice(1)
## # A tibble: 1 × 2
##   product_id total_sales
##        <int>       <dbl>
## 1    5569230        954.
head(q4_results)
## # A tibble: 6 × 2
##   product_id total_sales
##        <int>       <dbl>
## 1    5569230        954.
## 2    5569471        534.
## 3    8090537        432.
## 4    8090521        396.
## 5    1065538        254.
## 6    5569845        214.

5569230 is the top product ID but this is the largest total sales 953.62 according to dataset.

pizza_products <- products %>%
  mutate(product_id = as.integer(product_id)) %>%   # 🔧 FIX
  filter(str_detect(product_type, regex("pizza", ignore_case = TRUE))) %>%
  select(product_id, product_type) %>%
  distinct()

q6 <- pizza_products %>%
  inner_join(transactions, by = "product_id") %>%
  group_by(product_id, product_type) %>%
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales))

q6 %>% slice(1:20)
## # A tibble: 20 × 3
##    product_id product_type          total_sales
##         <int> <chr>                       <dbl>
##  1     944139 PIZZA/TRADITIONAL           1344.
##  2     906838 PIZZA/PREMIUM               1264.
##  3   12648296 PIZZA/TRADITIONAL           1230.
##  4     969568 PIZZA/TRADITIONAL           1125.
##  5     925626 PIZZA/ECONOMY               1018.
##  6    1127624 PIZZA/ECONOMY                849 
##  7    1021116 PIZZA/TRADITIONAL            783.
##  8    1009368 PIZZA/PREMIUM                718.
##  9    9524291 PREP FD: PIZZA (COLD)        717.
## 10     995628 PIZZA/TRADITIONAL            703.
## 11    9521679 PREP FD: PIZZA (COLD)        644.
## 12    1124695 PIZZA/TRADITIONAL            625.
## 13     935284 PIZZA/ECONOMY                621 
## 14    1004596 PIZZA/PREMIUM                618.
## 15     898466 PIZZA/PREMIUM                591.
## 16    1134810 PIZZA/TRADITIONAL            555.
## 17    6773204 PIZZA/ECONOMY                537.
## 18     851819 PIZZA/TRADITIONAL            533.
## 19    9521787 PREP FD: PIZZA (COLD)        521.
## 20    1083721 PIZZA/ECONOMY                518
q6 %>% slice(1)
## # A tibble: 1 × 3
##   product_id product_type      total_sales
##        <int> <chr>                   <dbl>
## 1     944139 PIZZA/TRADITIONAL       1344.
pb <- products %>%
  mutate(product_id = as.integer(product_id)) %>%   
  filter(str_detect(product_type, regex("peanut butter", ignore_case = TRUE))) %>%
  select(product_id, product_type) %>%
  distinct()

pb %>% tally()   
## # A tibble: 1 × 1
##       n
##   <int>
## 1   144
q8 <- pb %>%
  inner_join(transactions, by = "product_id") %>%
  group_by(month = month(transaction_timestamp, label = TRUE)) %>%
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales))

q8
## # A tibble: 12 × 2
##    month total_sales
##    <ord>       <dbl>
##  1 Dec         1014.
##  2 Sep          903.
##  3 Aug          895.
##  4 Nov          886.
##  5 Oct          881.
##  6 Jun          872.
##  7 Jan          872.
##  8 Jul          858.
##  9 May          845.
## 10 Apr          794.
## 11 Feb          747.
## 12 Mar          735.
q8 %>% slice(1)
## # A tibble: 1 × 2
##   month total_sales
##   <ord>       <dbl>
## 1 Dec         1014.
q9 <- coupon_redemptions %>%
  filter(campaign_id == 18, coupon_upc == "10000085475") %>%
  distinct(household_id) %>%
  summarise(households_redeemed = n())

q9
## # A tibble: 1 × 1
##   households_redeemed
##                 <int>
## 1                  63
library(dplyr)

products <- products %>% mutate(product_id = as.integer(product_id))
coupons  <- coupons  %>% mutate(product_id = as.integer(product_id))

coupon_products <- coupons %>%
  filter(campaign_id == 18, coupon_upc == "10000085475") %>%
  distinct(campaign_id, coupon_upc, product_id)

redeems <- coupon_redemptions %>%
  filter(campaign_id == 18, coupon_upc == "10000085475") %>%
  distinct(household_id, campaign_id, coupon_upc, redemption_date)

q10 <- redeems %>%
  inner_join(coupon_products, by = c("campaign_id", "coupon_upc"), relationship = "many-to-many") %>%
  inner_join(products, by = "product_id") %>%
  inner_join(transactions, by = c("household_id", "product_id")) %>%
  filter(as.Date(transaction_timestamp) == as.Date(redemption_date)) %>%
  distinct(household_id, product_id, transaction_timestamp, sales_value, quantity, product_type) %>%
  group_by(product_type) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales))

q10 %>% slice(1)
## # A tibble: 1 × 2
##   product_type total_sales
##   <chr>              <dbl>
## 1 BANANAS             37.4
q10 %>% slice(1:20)
## # A tibble: 20 × 2
##    product_type                  total_sales
##    <chr>                               <dbl>
##  1 BANANAS                              37.4
##  2 GRAPES RED                           33.0
##  3 POTATOES RUSSET (BULK&BAG)           29.2
##  4 CARROTS MINI PEELED                  23.8
##  5 CELERY                               22.1
##  6 GARDEN PLUS                          17.2
##  7 APPLES GALA (BULK&BAG)               16.8
##  8 DRIED FRUIT - OTHER                  16.5
##  9 TOMATOES HOTHOUSE ON THE VINE        15.3
## 10 PINEAPPLE WHOLE&PEEL/CORED           14.5
## 11 ORANGES NAVELS ALL                   14.2
## 12 BROCCOLI WHOLE&CROWNS                13.2
## 13 BLENDS                               13.0
## 14 STRAWBERRIES                         12.0
## 15 HEAD LETTUCE                         11.9
## 16 ONIONS YELLOW (BULK&BAG)             11.7
## 17 CANTALOUPE WHOLE                     11.0
## 18 ONIONS SWEET (BULK&BAG)              10.8
## 19 GRAPES WHITE                         10.4
## 20 APPLES FUJI (BULK&BAG)               10.4
q10 %>%
  filter(product_type %in% c("ORGANIC APPLES", "ASPARAGUS", "CELERY", "VARIETY LETTUCE")) %>%
  arrange(desc(total_sales))
## # A tibble: 4 × 2
##   product_type    total_sales
##   <chr>                 <dbl>
## 1 CELERY                22.1 
## 2 VARIETY LETTUCE        9.16
## 3 ORGANIC APPLES         6.98
## 4 ASPARAGUS              3.67