library(tidyverse)
library(lubridate)
library(completejourney)
transactions <- get_transactions()
promotions <- get_promotions()
demographics <- completejourney::demographics
products <- completejourney::products
glimpse(transactions)
## Rows: 1,469,307
## Columns: 11
## $ household_id <chr> "900", "900", "1228", "906", "906", "906", "906"…
## $ store_id <chr> "330", "330", "406", "319", "319", "319", "319",…
## $ basket_id <chr> "31198570044", "31198570047", "31198655051", "31…
## $ product_id <chr> "1095275", "9878513", "1041453", "1020156", "105…
## $ quantity <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ sales_value <dbl> 0.50, 0.99, 1.43, 1.50, 2.78, 5.49, 1.50, 1.88, …
## $ retail_disc <dbl> 0.00, 0.10, 0.15, 0.29, 0.80, 0.50, 0.29, 0.21, …
## $ coupon_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coupon_match_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ week <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ transaction_timestamp <dttm> 2017-01-01 06:53:26, 2017-01-01 07:10:28, 2017-…
glimpse(promotions)
## Rows: 20,940,529
## Columns: 5
## $ product_id <chr> "1000050", "1000050", "1000050", "1000092", "1000092"…
## $ store_id <chr> "316", "337", "441", "292", "293", "295", "298", "299…
## $ display_location <fct> 9, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mailer_location <fct> 0, 0, 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,…
## $ week <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
glimpse(products)
## Rows: 92,331
## Columns: 7
## $ product_id <chr> "25671", "26081", "26093", "26190", "26355", "26426",…
## $ manufacturer_id <chr> "2", "2", "69", "69", "69", "69", "69", "69", "69", "…
## $ department <chr> "GROCERY", "MISCELLANEOUS", "PASTRY", "GROCERY", "GRO…
## $ brand <fct> National, National, Private, Private, Private, Privat…
## $ product_category <chr> "FRZN ICE", NA, "BREAD", "FRUIT - SHELF STABLE", "COO…
## $ product_type <chr> "ICE - CRUSHED/CUBED", NA, "BREAD:ITALIAN/FRENCH", "A…
## $ package_size <chr> "22 LB", NA, NA, "50 OZ", "14 OZ", "2.5 OZ", "16 OZ",…
glimpse(demographics)
## Rows: 801
## Columns: 8
## $ household_id <chr> "1", "1001", "1003", "1004", "101", "1012", "1014", "10…
## $ age <ord> 65+, 45-54, 35-44, 25-34, 45-54, 35-44, 45-54, 45-54, 4…
## $ income <ord> 35-49K, 50-74K, 25-34K, 15-24K, Under 15K, 35-49K, 15-2…
## $ home_ownership <ord> Homeowner, Homeowner, NA, NA, Homeowner, NA, NA, Homeow…
## $ marital_status <ord> Married, Unmarried, Unmarried, Unmarried, Married, Marr…
## $ household_size <ord> 2, 1, 1, 1, 4, 5+, 4, 1, 5+, 2, 5+, 4, 2, 1, 5+, 1, 1, …
## $ household_comp <ord> 2 Adults No Kids, 1 Adult No Kids, 1 Adult No Kids, 1 A…
## $ kids_count <ord> 0, 0, 0, 0, 2, 3+, 2, 0, 3+, 0, 3+, 2, 0, 0, 3+, 0, 0, …
names(transactions)
## [1] "household_id" "store_id" "basket_id"
## [4] "product_id" "quantity" "sales_value"
## [7] "retail_disc" "coupon_disc" "coupon_match_disc"
## [10] "week" "transaction_timestamp"
names(demographics)
## [1] "household_id" "age" "income" "home_ownership"
## [5] "marital_status" "household_size" "household_comp" "kids_count"
trans_demo <- transactions %>%
left_join(demographics, by = "household_id")
dim(trans_demo)
## [1] 1469307 18
After having joined the transactions with demographics by household_id, new dataset has 1,469,307 rows and 18 columns.
trans_demo_prod <- trans_demo %>%
left_join(products, by = "product_id")
dim(trans_demo_prod)
## [1] 1469307 24
After having joined in products id, this dataset still has 1,469,307 rows but now has 24 columns.
final_data <- trans_demo_prod %>%
left_join(promotions, by = c("product_id", "store_id", "week"))
## Warning in left_join(., promotions, by = c("product_id", "store_id", "week")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 25 of `x` matches multiple rows in `y`.
## ℹ Row 352741 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
dim(final_data)
## [1] 1470002 26
After having joined promotions, the rows increased to 1470002 and 26 columns.
trans_with_demo <- transactions %>%
inner_join(demographics, by = "household_id")
dim(trans_with_demo)
## [1] 828850 18
trans_without_demo <- transactions %>%
anti_join(demographics, by = "household_id")
dim(trans_without_demo)
## [1] 640457 11
Out of 1,469,307 total transactions, 828,850 have demographic information, while 640,457 transactions do not.
households_with_demo <- transactions %>%
semi_join(demographics, by = "household_id") %>%
distinct(household_id)
nrow(households_with_demo)
## [1] 801
There are 801 unique households with demographic info
households_without_demo <- transactions %>%
anti_join(demographics, by = "household_id") %>%
distinct(household_id)
nrow(households_without_demo)
## [1] 1668
There are 1,668 unique households without demographic info
household_spend <- transactions %>%
inner_join(demographics, by = "household_id") %>%
group_by(household_id, income) %>%
summarise(total_spend = sum(sales_value), .groups = "drop")
high_spend_households <- household_spend %>%
filter(total_spend >= 1000)
high_spend_households %>%
count(income) %>%
arrange(desc(n))
## # A tibble: 12 × 2
## income n
## <ord> <int>
## 1 50-74K 185
## 2 35-49K 154
## 3 75-99K 88
## 4 25-34K 69
## 5 15-24K 67
## 6 Under 15K 58
## 7 125-149K 36
## 8 100-124K 33
## 9 150-174K 27
## 10 175-199K 10
## 11 250K+ 10
## 12 200-249K 5
The income range with most household spending of 1,000 or more is 50-74k with 185 households.
n_distinct(transactions$household_id)
## [1] 2469
there are 2469 unique households in transactions data.
library(dplyr)
library(completejourney)
transactions <- get_transactions()
promotions <- get_promotions()
dim(transactions)
## [1] 1469307 11
dim(promotions)
## [1] 20940529 5
transactions <- transactions %>%
mutate(
product_id = as.integer(product_id),
store_id = as.integer(store_id),
week = as.integer(week)
)
promotions <- promotions %>%
mutate(
product_id = as.integer(product_id),
store_id = as.integer(store_id),
week = as.integer(week)
)
front_display_trans <- promotions %>%
filter(display_location == 1) %>%
distinct(product_id, store_id, week) %>%
inner_join(transactions, by = c("product_id", "store_id", "week"))
q4_results <- front_display_trans %>%
group_by(product_id) %>%
summarise(total_front_display_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_front_display_sales))
q4_results %>% slice(1)
## # A tibble: 1 × 2
## product_id total_front_display_sales
## <int> <dbl>
## 1 5569230 954.
dim(transactions); dim(promotions)
## [1] 1469307 11
## [1] 20940529 5
sapply(transactions[c("product_id","store_id","week")], class)
## product_id store_id week
## "integer" "integer" "integer"
nrow(front_display_trans)
## [1] 11134
with_demo <- transactions %>%
inner_join(demographics, by = "household_id") %>%
tally()
without_demo <- transactions %>%
anti_join(demographics, by = "household_id") %>%
tally()
with_demo
## # A tibble: 1 × 1
## n
## <int>
## 1 828850
without_demo
## # A tibble: 1 × 1
## n
## <int>
## 1 640457
q2_age_sales <- transactions %>%
inner_join(demographics, by = "household_id") %>%
group_by(age) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_sales))
q2_age_sales
## # A tibble: 6 × 2
## age total_sales
## <ord> <dbl>
## 1 45-54 971822.
## 2 35-44 724357.
## 3 25-34 453372.
## 4 65+ 176601.
## 5 55-64 173154.
## 6 19-24 125673.
q2_age_sales %>% slice(1)
## # A tibble: 1 × 2
## age total_sales
## <ord> <dbl>
## 1 45-54 971822.
hshld_1000 <- transactions %>%
group_by(household_id) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
filter(total_sales >= 1000)
hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 742
q3_income_counts <- hshld_1000 %>%
inner_join(demographics, by = "household_id") %>%
count(income, sort = TRUE)
q3_income_counts
## # A tibble: 12 × 2
## income n
## <ord> <int>
## 1 50-74K 185
## 2 35-49K 154
## 3 75-99K 88
## 4 25-34K 69
## 5 15-24K 67
## 6 Under 15K 58
## 7 125-149K 36
## 8 100-124K 33
## 9 150-174K 27
## 10 175-199K 10
## 11 250K+ 10
## 12 200-249K 5
q3_income_counts %>% slice(1)
## # A tibble: 1 × 2
## income n
## <ord> <int>
## 1 50-74K 185
front_products <- promotions %>%
filter(display_location == 1) %>%
distinct(product_id)
q4_results <- transactions %>%
semi_join(front_products, by = "product_id") %>%
group_by(product_id) %>%
summarise(
total_sales = sum(sales_value * quantity, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(total_sales))
q4_results %>% slice(1)
## # A tibble: 1 × 2
## product_id total_sales
## <int> <dbl>
## 1 995242 41379.
q5 <- coupons %>%
filter(campaign_id == 18, coupon_upc == 10000089238) %>%
inner_join(products, by = "product_id") %>%
distinct(product_category)
q5
## # A tibble: 1 × 1
## product_category
## <chr>
## 1 SMOKED MEATS
front_display_trans <- promotions %>%
filter(display_location == 1) %>%
distinct(product_id, store_id, week) %>%
inner_join(transactions, by = c("product_id", "store_id", "week"))
q4_results <- front_display_trans %>%
group_by(product_id) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_sales))
q4_results %>% slice(1)
## # A tibble: 1 × 2
## product_id total_sales
## <int> <dbl>
## 1 5569230 954.
head(q4_results)
## # A tibble: 6 × 2
## product_id total_sales
## <int> <dbl>
## 1 5569230 954.
## 2 5569471 534.
## 3 8090537 432.
## 4 8090521 396.
## 5 1065538 254.
## 6 5569845 214.
5569230 is the top product ID but this is the largest total sales 953.62 according to dataset.
pizza_products <- products %>%
mutate(product_id = as.integer(product_id)) %>% # 🔧 FIX
filter(str_detect(product_type, regex("pizza", ignore_case = TRUE))) %>%
select(product_id, product_type) %>%
distinct()
q6 <- pizza_products %>%
inner_join(transactions, by = "product_id") %>%
group_by(product_id, product_type) %>%
summarise(
total_sales = sum(sales_value, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(total_sales))
q6 %>% slice(1:20)
## # A tibble: 20 × 3
## product_id product_type total_sales
## <int> <chr> <dbl>
## 1 944139 PIZZA/TRADITIONAL 1344.
## 2 906838 PIZZA/PREMIUM 1264.
## 3 12648296 PIZZA/TRADITIONAL 1230.
## 4 969568 PIZZA/TRADITIONAL 1125.
## 5 925626 PIZZA/ECONOMY 1018.
## 6 1127624 PIZZA/ECONOMY 849
## 7 1021116 PIZZA/TRADITIONAL 783.
## 8 1009368 PIZZA/PREMIUM 718.
## 9 9524291 PREP FD: PIZZA (COLD) 717.
## 10 995628 PIZZA/TRADITIONAL 703.
## 11 9521679 PREP FD: PIZZA (COLD) 644.
## 12 1124695 PIZZA/TRADITIONAL 625.
## 13 935284 PIZZA/ECONOMY 621
## 14 1004596 PIZZA/PREMIUM 618.
## 15 898466 PIZZA/PREMIUM 591.
## 16 1134810 PIZZA/TRADITIONAL 555.
## 17 6773204 PIZZA/ECONOMY 537.
## 18 851819 PIZZA/TRADITIONAL 533.
## 19 9521787 PREP FD: PIZZA (COLD) 521.
## 20 1083721 PIZZA/ECONOMY 518
q6 %>% slice(1)
## # A tibble: 1 × 3
## product_id product_type total_sales
## <int> <chr> <dbl>
## 1 944139 PIZZA/TRADITIONAL 1344.
pb <- products %>%
mutate(product_id = as.integer(product_id)) %>%
filter(str_detect(product_type, regex("peanut butter", ignore_case = TRUE))) %>%
select(product_id, product_type) %>%
distinct()
pb %>% tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 144
q8 <- pb %>%
inner_join(transactions, by = "product_id") %>%
group_by(month = month(transaction_timestamp, label = TRUE)) %>%
summarise(
total_sales = sum(sales_value, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(total_sales))
q8
## # A tibble: 12 × 2
## month total_sales
## <ord> <dbl>
## 1 Dec 1014.
## 2 Sep 903.
## 3 Aug 895.
## 4 Nov 886.
## 5 Oct 881.
## 6 Jun 872.
## 7 Jan 872.
## 8 Jul 858.
## 9 May 845.
## 10 Apr 794.
## 11 Feb 747.
## 12 Mar 735.
q8 %>% slice(1)
## # A tibble: 1 × 2
## month total_sales
## <ord> <dbl>
## 1 Dec 1014.
q9 <- coupon_redemptions %>%
filter(campaign_id == 18, coupon_upc == "10000085475") %>%
distinct(household_id) %>%
summarise(households_redeemed = n())
q9
## # A tibble: 1 × 1
## households_redeemed
## <int>
## 1 63
library(dplyr)
products <- products %>% mutate(product_id = as.integer(product_id))
coupons <- coupons %>% mutate(product_id = as.integer(product_id))
coupon_products <- coupons %>%
filter(campaign_id == 18, coupon_upc == "10000085475") %>%
distinct(campaign_id, coupon_upc, product_id)
redeems <- coupon_redemptions %>%
filter(campaign_id == 18, coupon_upc == "10000085475") %>%
distinct(household_id, campaign_id, coupon_upc, redemption_date)
q10 <- redeems %>%
inner_join(coupon_products, by = c("campaign_id", "coupon_upc"), relationship = "many-to-many") %>%
inner_join(products, by = "product_id") %>%
inner_join(transactions, by = c("household_id", "product_id")) %>%
filter(as.Date(transaction_timestamp) == as.Date(redemption_date)) %>%
distinct(household_id, product_id, transaction_timestamp, sales_value, quantity, product_type) %>%
group_by(product_type) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_sales))
q10 %>% slice(1)
## # A tibble: 1 × 2
## product_type total_sales
## <chr> <dbl>
## 1 BANANAS 37.4
q10 %>% slice(1:20)
## # A tibble: 20 × 2
## product_type total_sales
## <chr> <dbl>
## 1 BANANAS 37.4
## 2 GRAPES RED 33.0
## 3 POTATOES RUSSET (BULK&BAG) 29.2
## 4 CARROTS MINI PEELED 23.8
## 5 CELERY 22.1
## 6 GARDEN PLUS 17.2
## 7 APPLES GALA (BULK&BAG) 16.8
## 8 DRIED FRUIT - OTHER 16.5
## 9 TOMATOES HOTHOUSE ON THE VINE 15.3
## 10 PINEAPPLE WHOLE&PEEL/CORED 14.5
## 11 ORANGES NAVELS ALL 14.2
## 12 BROCCOLI WHOLE&CROWNS 13.2
## 13 BLENDS 13.0
## 14 STRAWBERRIES 12.0
## 15 HEAD LETTUCE 11.9
## 16 ONIONS YELLOW (BULK&BAG) 11.7
## 17 CANTALOUPE WHOLE 11.0
## 18 ONIONS SWEET (BULK&BAG) 10.8
## 19 GRAPES WHITE 10.4
## 20 APPLES FUJI (BULK&BAG) 10.4
q10 %>%
filter(product_type %in% c("ORGANIC APPLES", "ASPARAGUS", "CELERY", "VARIETY LETTUCE")) %>%
arrange(desc(total_sales))
## # A tibble: 4 × 2
## product_type total_sales
## <chr> <dbl>
## 1 CELERY 22.1
## 2 VARIETY LETTUCE 9.16
## 3 ORGANIC APPLES 6.98
## 4 ASPARAGUS 3.67