Libraries

library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

Plot 1

Finding Top Product Categories

# Rank product categories by total sales over the sampled transactions,
# highest first.
# The join key is spelled out so the join cannot silently change if the two
# tables ever share another column name, and so no "Joining with" message is
# emitted.
products %>%
  inner_join(transactions_sample, by = "product_id") %>%
  group_by(product_category) %>%
  summarize(total_sales = sum(sales_value)) %>%
  arrange(desc(total_sales))

I am going to exclude the category that is just a general miscellaneous bucket.

Plotting

# Plot 1: total sales per household composition for the five product
# categories named in the filter below, drawn as dodged bars (one bar per
# category within each household composition).
products %>%
  inner_join(transactions_sample, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  # Match category names exactly. A regular expression would also keep any
  # category that merely contains one of these names as a substring.
  filter(
    product_category %in% c(
      "SOFT DRINKS",
      "BEEF",
      "FLUID MILK PRODUCTS",
      "CHEESE",
      "FRZN MEAT/MEAT DINNERS"
    )
  ) %>%
  group_by(product_category, household_comp) %>%
  # Drop the grouping after summarising so the plot receives a plain table
  # and no "grouped output" message is printed.
  summarize(total_sales = sum(sales_value), .groups = "drop") %>%
  ggplot(aes(x = household_comp, y = total_sales, fill = product_category)) +
  # geom_col() plots the pre-computed totals as bar heights.
  geom_col(position = "dodge") +
  labs(
    title = "Spending on Product Categories by Household Composition",
    subtitle = "Looking at the Top 5 Product Categories",
    x = "Household Composition",
    y = "Total Sales ($)",
    fill = "Product Category"
  )

Plot 2

# Plot 2: weekly candy sales, one line per age group.
transactions_sample %>%
  inner_join(demographics, by = "household_id") %>%
  inner_join(products, by = "product_id") %>%
  # Case-insensitive substring match: keeps every product category whose name
  # contains "candy".
  filter(str_detect(product_category, regex("CANDY", ignore_case = TRUE))) %>%
  group_by(week, age) %>%
  # Drop the grouping after summarising so no "grouped output" message is
  # printed; the lines are grouped explicitly through the aesthetics below.
  summarize(total_sales = sum(sales_value), .groups = "drop") %>%
  ggplot(aes(week, total_sales, group = age, color = age)) +
  geom_line() +
  labs(
    title = "Candy Spending Across the Year by Age Group",
    x = "Week",
    y = "Total Sales ($)",
    color = "Age Group"
  ) +
  # The label position is hard-coded in data units (week 15, $42).
  annotate(
    geom = "label",
    label = "Largest Spike in Sales (for Easter?)",
    x = 15,
    y = 42
  )

Plot 3

Finding Top Product IDs

# Rank individual products by total sales over the sampled transactions,
# highest first.
# No column from `products` is used after the join; it only restricts the
# transactions to product IDs that appear in `products`. The key is spelled
# out so no "Joining with" message is emitted.
transactions_sample %>%
  inner_join(products, by = "product_id") %>%
  group_by(product_id) %>%
  summarize(total_sales = sum(sales_value)) %>%
  arrange(desc(total_sales))

Plotting

# Plot 3: total sales of ten selected product IDs, restricted to households
# whose kids_count is at least 1.
demographics %>%
  inner_join(transactions_sample, by = "household_id") %>%
  inner_join(products, by = "product_id") %>%
  filter(
    kids_count >= 1,
    # Match IDs exactly. An unanchored regular expression would also keep any
    # longer product ID that merely contains one of these digit sequences.
    product_id %in% c(
      "6534178", "6533889", "6533765", "1029743", "6534166",
      "1106523", "1082185", "995242", "916122", "1044078"
    )
  ) %>%
  group_by(product_id) %>%
  summarize(total_sales = sum(sales_value)) %>%
  ggplot(aes(product_id, total_sales)) +
  geom_col() +
  labs(
    title = "Spending on Products by Households with Kids",
    subtitle = "Looking at the Top 10 Products",
    x = "Product ID",
    y = "Total Sales ($)"
  ) +
  # The label position is hard-coded: x = 8 is the eighth position on the
  # product axis, y = 4200 is in sales dollars.
  annotate(
    geom = "label",
    label = "~8 Times Greater Sales than Any Other",
    x = 8,
    y = 4200
  )