week5_lab_Sam_Johnson.html

output: html_document


Join transactions, products, and demographics datasets

# Total sales per (income, product_category), reduced to the five
# best-selling product categories within each income level.
# inner_join() keeps only transactions that have both a matching product
# and a matching household in demographics.
# NOTE(review): `%in%` silently drops labels that do not occur in
# `demographics$income`; confirm that "Under 35K" and "100K+" are real
# levels of that column -- TODO verify against the data.
df1 <- transactions_sample %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  filter(income %in% c("Under 35K", "35-49K", "50-74K", "75-99K", "100K+")) %>%
  group_by(income, product_category) %>%
  # Drop the grouping explicitly instead of relying on summarise()'s default.
  summarise(total_sales = sum(sales_value), .groups = "drop") %>%
  group_by(income) %>%
  # with_ties = FALSE caps each income level at five rows, matching the
  # previous arrange(desc(total_sales)) + slice_head(n = 5) behaviour.
  slice_max(total_sales, n = 5, with_ties = FALSE) %>%
  # Return a plain (ungrouped) tibble so later steps are not silently grouped.
  ungroup()

Plot total sales by product category for different income levels

# Horizontal dodged bar chart of df1: one bar per (product category, income)
# pair, with categories ordered by total sales via reorder().
ggplot(
  data = df1,
  mapping = aes(
    x = reorder(product_category, total_sales),
    y = total_sales,
    fill = income
  )
) +
  # geom_col() draws bar heights directly from the y values.
  geom_col(position = "dodge") +
  # Flip the axes so the category labels are on the vertical axis.
  coord_flip() +
  labs(
    title = "Top 5 Product Categories by Income Level",
    subtitle = "Total sales value for top categories, grouped by income",
    x = "Product Category",
    y = "Total Sales Value ($)",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()

Aggregate total sales by household size and week

# Total sales value per (week, household_size).
# inner_join() keeps only transactions whose household_id appears in
# demographics.
df2 <- transactions_sample %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(week, household_size) %>%
  # `.groups = "drop"` returns an ungrouped tibble and silences the
  # "summarise() has grouped output by 'week'" message.
  summarise(total_sales = sum(sales_value), .groups = "drop")

Plot total sales over time by household size

# Line chart of df2: weekly total sales, one coloured line per household size.
ggplot(df2, aes(x = week, y = total_sales, color = household_size)) +
  # `linewidth` replaces the `size` aesthetic for lines, which was
  # deprecated in ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  labs(
    title = "Shopping Patterns Over Time by Household Size",
    subtitle = "Total sales value across different household sizes",
    x = "Week",
    y = "Total Sales Value ($)",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()

Join transactions, products, and demographics datasets and keep the top 5 product categories by quantity purchased for each household size

# Total quantity purchased per (household_size, product_category), reduced
# to the five highest-quantity product categories within each household size.
# inner_join() keeps only transactions that have both a matching product
# and a matching household in demographics.
df3 <- transactions_sample %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(household_size, product_category) %>%
  # `.groups = "drop"` silences the "summarise() has grouped output by
  # 'household_size'" message; the grouping needed next is set explicitly.
  summarise(total_quantity = sum(quantity), .groups = "drop") %>%
  group_by(household_size) %>%
  # with_ties = FALSE caps each household size at five rows, matching the
  # previous arrange(desc(total_quantity)) + slice_head(n = 5) behaviour.
  slice_max(total_quantity, n = 5, with_ties = FALSE) %>%
  # Return a plain (ungrouped) tibble so later steps are not silently grouped.
  ungroup()

Plot total quantity purchased by product category for different household sizes

# Horizontal dodged bar chart of df3: one bar per (product category,
# household size) pair, with categories ordered by total quantity via reorder().
ggplot(
  data = df3,
  mapping = aes(
    x = reorder(product_category, total_quantity),
    y = total_quantity,
    fill = household_size
  )
) +
  # geom_col() draws bar heights directly from the y values.
  geom_col(position = "dodge") +
  # Flip the axes so the category labels are on the vertical axis.
  coord_flip() +
  labs(
    title = "Top Products by Purchase Quantity for Household Sizes",
    subtitle = "Top 5 products by quantity purchased, grouped by household size",
    x = "Product Category",
    y = "Total Quantity Purchased",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()