This document contains three informative plots that provide unique insights and tell a story about the completejourney dataset.
Load the libraries needed to work with the completejourney dataset.
# Per-household spending paired with the household's income bracket.
#
# Every transaction row gets a per-unit loyalty-card price,
#   (sales_value - (retail_disc + coupon_match_disc)) / quantity,
# where a divisor of 1 is substituted when quantity is 0 or NA so the
# division never produces Inf/NaN. Those per-row prices are then summed
# within each household.
household_exp <- transactions %>%
  left_join(demographics, by = "household_id") %>%
  # Row-wise arithmetic needs no grouping, so it runs before group_by().
  mutate(
    loyalty_card_price =
      (sales_value - (retail_disc + coupon_match_disc)) /
        ifelse(quantity == 0 | is.na(quantity), 1, quantity)
  ) %>%
  group_by(household_id) %>%
  summarize(
    household_spending = sum(loyalty_card_price, na.rm = TRUE),
    # income comes from the joined demographics row; take the first value
    # seen within the household's rows.
    household_income = first(income)
  ) %>%
  arrange(desc(household_income)) %>%
  # Drop rows containing any NA, e.g. households the left join could not
  # match to a demographics record. The pipe already supplies the data as
  # the first argument, so na.omit() takes no further arguments.
  na.omit()
# Box plot of household spending by income bracket, with every household
# overlaid as a horizontally jittered point.
ggplot(
  household_exp,
  aes(x = factor(household_income), y = household_spending)
) +
  # Jitter sideways only: height = 0 keeps each point at its actual
  # spending value instead of adding vertical noise.
  geom_jitter(
    color = "darkblue", size = 1.5, alpha = 0.3,
    width = 0.2, height = 0
  ) +
  # Outlier glyphs are suppressed because the jitter layer already plots
  # every observation; the fill is semi-transparent.
  geom_boxplot(
    outlier.shape = NA, fill = "skyblue", color = "darkblue", alpha = 0.6
  ) +
  # Adds error-bar style caps using the box plot statistics.
  stat_boxplot(geom = "errorbar", width = 0.2, color = "darkblue") +
  # theme_minimal() must precede theme() so the overrides below survive.
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    axis.title = element_text(size = 12),
    axis.text.x = element_text(angle = 60, hjust = 1, size = 9),
    panel.grid.major = element_line(color = "lightgray"),
    panel.grid.minor = element_blank(),
    axis.ticks = element_blank()
  ) +
  labs(
    title = "Total spending per total income for each household",
    subtitle = "Analyzing the total spending of each household based on their income bracket",
    x = "Household Income",
    y = "Household Spending"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +
  # Truncate each x-axis label to a display width of at most 10.
  scale_x_discrete(labels = function(x) strtrim(x, 10))
# Total sales for each (kids_count, income) combination, drawn as one
# point per combination and colored by income bracket.
demographics %>%
  # inner_join keeps only households present in both tables.
  inner_join(transactions, by = "household_id") %>%
  # Group after the join, once sales_value is actually available.
  group_by(kids_count, income) %>%
  summarize(
    total_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = kids_count, y = total_sales)) +
  # Horizontal jitter only, so the plotted totals are not displaced.
  geom_jitter(aes(color = income), width = 0.2, height = 0) +
  # The axis title is set once, in labs() below.
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Total Sales of All Products per Income Range and Kids Count",
    subtitle = "Comparing the total sales of all products amongst income ranges and kids count",
    x = "Kids Count",
    y = "Total Sales",
    color = "Income"
  ) +
  theme_minimal()
# Five product categories with the highest mean sales_value per
# transaction row, ordered from highest to lowest.
lollipop_data <-
  left_join(transactions, products, by = "product_id") %>%
  group_by(product_category) %>%
  summarise(avg_spending = mean(sales_value, na.rm = TRUE)) %>%
  # Highest average first, then keep the leading five rows.
  arrange(desc(avg_spending)) %>%
  slice_head(n = 5)
# Lollipop chart of the categories in lollipop_data, ordered by their
# average spending and flipped so the category names read horizontally.
ggplot(
  lollipop_data,
  aes(x = reorder(product_category, avg_spending), y = avg_spending)
) +
  # Stems are drawn first so the heads sit on top of them. Both ends of
  # each stem use the same reordered factor as the points, so the axis
  # order does not depend on which layer trains the scale first.
  geom_segment(
    aes(
      xend = reorder(product_category, avg_spending),
      y = 0,
      yend = avg_spending
    ),
    color = "blue"
  ) +
  geom_point(size = 4, color = "blue") +
  coord_flip() +
  labs(
    title = "Average Spending by Top 5 Product Categories",
    subtitle = "Highlighting the highest average spending across product categories",
    x = "Product Category",
    y = "Average Spending ($)"
  ) +
  theme_minimal()