library(tidyverse)
library(completejourney)
transactions <- get_transactions()
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
demographics
## # A tibble: 801 × 8
## household_id age income home_ownership marital_status household_size
## <chr> <ord> <ord> <ord> <ord> <ord>
## 1 1 65+ 35-49K Homeowner Married 2
## 2 1001 45-54 50-74K Homeowner Unmarried 1
## 3 1003 35-44 25-34K <NA> Unmarried 1
## 4 1004 25-34 15-24K <NA> Unmarried 1
## 5 101 45-54 Under 15K Homeowner Married 4
## 6 1012 35-44 35-49K <NA> Married 5+
## 7 1014 45-54 15-24K <NA> Married 4
## 8 1015 45-54 50-74K Homeowner Unmarried 1
## 9 1018 45-54 35-49K Homeowner Married 5+
## 10 1020 45-54 25-34K Homeowner Married 2
## # ℹ 791 more rows
## # ℹ 2 more variables: household_comp <ord>, kids_count <ord>
transactions
## # A tibble: 1,469,307 × 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 900 330 31198570044 1095275 1 0.5 0
## 2 900 330 31198570047 9878513 1 0.99 0.1
## 3 1228 406 31198655051 1041453 1 1.43 0.15
## 4 906 319 31198705046 1020156 1 1.5 0.29
## 5 906 319 31198705046 1053875 2 2.78 0.8
## 6 906 319 31198705046 1060312 1 5.49 0.5
## 7 906 319 31198705046 1075313 1 1.5 0.29
## 8 1058 381 31198676055 985893 1 1.88 0.21
## 9 1058 381 31198676055 988791 1 1.5 1.29
## 10 1058 381 31198676055 9297106 1 2.69 0
## # ℹ 1,469,297 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
Plot # 1
transactions %>%
inner_join(demographics, by = "household_id") %>%
group_by(age, income) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
ggplot(aes(x = age, y = total_sales)) +
geom_col(fill = "blue", alpha = 0.5) +
scale_y_continuous(name = "Total Sales per Age Group", labels = scales:: dollar)+
scale_x_discrete(name = "Age") +
labs(
title = "Total Sales for Each Age Group",
subtitle = paste("Analyzing the total sales for each age group based on income
level. It appears that sales peak at the income range of
50-74k with an age of 45-54. The sales drop off at an income
range of 200-249K")
)+
facet_wrap(~income) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Plot # 2
## Filtering to find the most popular 16 OZ product
top_16oz_product_type <- transactions %>%
inner_join(products, by = "product_id") %>%
filter(package_size == "16 OZ") %>% # Filter only 16oz products
group_by(product_type) %>% # Group by product type
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>% # Sum sales
arrange(desc(total_sales)) %>% # Sort in descending order
slice(1)
transactions %>%
inner_join(products, by = "product_id") %>%
filter(package_size %in% c("16 OZ", "20 OZ", "24 OZ", "32 OZ")) %>%
ggplot(aes(x = package_size)) +
geom_bar(fill = "orange") +
scale_y_continuous(name = "Count") +
scale_x_discrete(name = "Package Size") +
labs(title = "Count of Transactions by Package Size",
subtitle = "After anaylysing the count of package size, it is easy to see
that the bigger the size the less that are sold. Diving deeper into the most popular
category which is 16 OZ. It was found that Strawberries were the most bought item from
the 16 OZ category. Bringing in 13563.77 dollars in total sales")

Plot # 3
products %>%
inner_join(transactions, by = "product_id") %>%
filter(tolower(product_category) %in% tolower(c("COOKIES/CONES", "ICE CREAM/MILK/SHERBTS", "DRY MIX DESSERTS", "CAKES"))) %>%
group_by(package_size, product_category, brand) %>%
summarise(total_sales = sum(sales_value), .groups = "drop") %>%
ggplot(aes(x = total_sales, y = product_category)) +
geom_point(color = "red", alpha = .4) +
scale_x_log10(name = "Total Sales", labels = scales:: dollar)+
scale_y_discrete(name = "Desserts") +
facet_wrap(~brand) +
labs(
title = "Total Sales for Dessert Products",
subtitle = "Sales Data based on Kroger Desserts and their Total Sales per each
brand. What sticks out in this plot is that in the cookies/cones row the
National sales are spread much further in price range in comparision to
cookie/cones in Private sales"
)
