## Which Product Categories are the Most Popular?
# Attach product attributes to every transaction line so that lines can be
# tallied by product category.
joined_data2 <- inner_join(transactions, products, by = "product_id")

# Number of transaction lines per category, largest first.
popular_categories <- joined_data2 %>%
  count(product_category, name = "transaction_count", sort = TRUE)

# The table is already sorted, so the first ten rows are the top ten.
top_10_categories <- slice_head(popular_categories, n = 10)

# Horizontal bars, ordered by count so the largest category sits at the top.
ggplot(
  top_10_categories,
  aes(
    x = reorder(product_category, transaction_count),
    y = transaction_count
  )
) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Most Popular Product Categories",
    x = "Product Category",
    y = "Number of Transactions"
  ) +
  theme_classic()
## What Are the Total Candy Sales and Is There a Trend by Day of the Week?
# Attach product attributes to every transaction line. The join key is spelled
# out so the join stays explicit and emits no "Joining with ..." message.
joined_data1 <- products %>%
  inner_join(transactions, by = "product_id")

# Keep checklane candy only, minus the product types left out of this chart.
candy <- joined_data1 %>%
  filter(
    str_detect(
      product_category,
      regex("CANDY - CHECKLANE", ignore_case = TRUE)
    ),
    !product_type %in% c(
      "SEASONAL CANDY BAGS-CHOCOLATE", "NOVELTY CANDY", "MISC CHECKLANE CANDY",
      "CANDY & BREATH MINTS (PKGD) (N"
    )
  )

# `candy` already carries every transaction column, so it is summarised
# directly. Re-joining `transactions` on all shared columns added no
# information and would multiply any exact-duplicate transaction rows,
# inflating the totals.
candy_sales <- candy %>%
  mutate(day = lubridate::wday(transaction_timestamp, label = TRUE)) %>%
  group_by(day, product_type) %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop")

# Side-by-side bars: one bar per product type within each weekday.
ggplot(candy_sales, aes(x = day, y = total_sales, fill = product_type)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::dollar) +
  labs(
    title = "Total Candy Sales by Day of the Week and Product Category",
    x = "Day of the Week",
    y = "Total Sales Value"
  ) +
  theme_bw() +
  scale_fill_manual(values = my_colors)
## How Does Household Composition Affect Basket Size and Frequency of Purchases?
# One row per transaction line, with the household's demographic attributes
# attached. The inner join drops households that have no demographics record.
joined_data <- transactions %>%
  inner_join(demographics, by = "household_id")

# Total spend per household.
summary_stats <- joined_data %>%
  group_by(household_id) %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE))

# Number of distinct baskets per household.
frequency_of_purchases <- joined_data %>%
  group_by(household_id) %>%
  summarize(purchase_frequency = n_distinct(basket_id))

# Per-household table: total spend, basket count, and age group.
basket_size <- summary_stats %>%
  left_join(frequency_of_purchases, by = "household_id") %>%
  left_join(
    demographics %>% select(household_id, age),
    by = "household_id"
  )

# Both axes are continuous, so each household is drawn as a point. A boxplot
# here collapses every facet into a single narrow box and hides the
# sales-vs-frequency relationship. Shape 21 is a fillable point, so the `fill`
# mapping and the manual fill palette still apply.
ggplot(basket_size, aes(x = purchase_frequency, y = total_sales, fill = age)) +
  geom_point(shape = 21, alpha = .5) +
  scale_fill_manual(values = my_colors1) +
  facet_grid(~ age) +
  labs(
    title = "Total Sales vs. Frequency of Purchases",
    x = "Frequency of Purchases",
    y = "Total Sales"
  ) +
  theme_linedraw()