week5_lab_Sam_Johnson.html
output: html_document
Join transactions, products, and demographics datasets
# Total sales value per (income, product category), keeping the five
# highest-selling categories within each income bracket.
# NOTE(review): the income labels below are matched literally by %in%; a
# label that is not an actual value of the `income` column silently matches
# no rows. Confirm that "Under 35K" and "100K+" really occur in the data.
df1 <- transactions_sample %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  filter(
    income %in% c("Under 35K", "35-49K", "50-74K", "75-99K", "100K+")
  ) %>%
  group_by(income, product_category) %>%
  # State the output grouping explicitly (grouped by income only) so that
  # summarise() does not emit its regrouping message.
  summarise(total_sales = sum(sales_value), .groups = "drop_last") %>%
  # Top rows per income group, largest first; with_ties = FALSE caps each
  # group at five rows, matching the former arrange() + slice_head(n = 5).
  slice_max(total_sales, n = 5, with_ties = FALSE) %>%
  # Drop the leftover grouping so later verbs are not silently per-income.
  ungroup()
Plot total sales by product category for different income
levels
# Horizontal, dodged bar chart of df1: one bar per (product category, income)
# row. Categories are ordered with reorder(), which by default ranks each
# category by the mean of its total_sales values.
ggplot(
  df1,
  aes(
    x = reorder(product_category, total_sales),
    y = total_sales,
    fill = income
  )
) +
  # geom_col() takes bar lengths directly from `y`; it is the idiomatic
  # equivalent of geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  # Flip axes so category names are read along the vertical axis.
  coord_flip() +
  labs(
    title = "Top 5 Product Categories by Income Level",
    subtitle = "Total sales value for top categories, grouped by income",
    x = "Product Category",
    y = "Total Sales Value ($)",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()

Aggregate total sales by week and household size
# Total sales value for every (week, household size) combination, restricted
# to transactions whose household has a row in `demographics` (inner join).
df2 <- transactions_sample %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(week, household_size) %>%
  # Return an ungrouped table and silence summarise()'s regrouping message.
  summarise(total_sales = sum(sales_value), .groups = "drop")
## `summarise()` has grouped output by 'week'. You can override using the
## `.groups` argument.
Plot total sales over time by household size
# Line chart of weekly total sales, one coloured line per household size.
ggplot(df2, aes(x = week, y = total_sales, color = household_size)) +
  # `linewidth` replaces the `size` aesthetic for lines, which was
  # deprecated in ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  labs(
    title = "Shopping Patterns Over Time by Household Size",
    subtitle = "Total sales value across different household sizes",
    x = "Week",
    y = "Total Sales Value ($)",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Join transactions, products, and demographics datasets and keep the top 5
product categories by quantity for each household size
# Total quantity purchased per (household size, product category), keeping
# the five highest-quantity categories within each household size.
df3 <- transactions_sample %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(household_size, product_category) %>%
  # State the output grouping explicitly (grouped by household_size only) so
  # that summarise() does not emit its regrouping message.
  summarise(total_quantity = sum(quantity), .groups = "drop_last") %>%
  # Top rows per household size, largest first; with_ties = FALSE caps each
  # group at five rows, matching the former arrange() + slice_head(n = 5).
  slice_max(total_quantity, n = 5, with_ties = FALSE) %>%
  # Drop the leftover grouping so later verbs are not silently per-group.
  ungroup()
## `summarise()` has grouped output by 'household_size'. You can override using
## the `.groups` argument.
Plot total quantity purchased by product category for different
household sizes
# Horizontal, dodged bar chart of df3: one bar per (product category,
# household size) row. Categories are ordered with reorder(), which by
# default ranks each category by the mean of its total_quantity values.
ggplot(
  df3,
  aes(
    x = reorder(product_category, total_quantity),
    y = total_quantity,
    fill = household_size
  )
) +
  # geom_col() takes bar lengths directly from `y`; it is the idiomatic
  # equivalent of geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  # Flip axes so category names are read along the vertical axis.
  coord_flip() +
  labs(
    title = "Top Products by Purchase Quantity for Household Sizes",
    subtitle = "Top 5 products by quantity purchased, grouped by household size",
    x = "Product Category",
    y = "Total Quantity Purchased",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()
