1. Plot: Top 5 products by households with and without children
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney) # Assuming the completejourney package is loaded
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(dplyr)
library(completejourney)
# Load datasets from the completejourney package.
# `transactions` is assigned exactly once: a data(transactions) call ahead of
# this assignment would be dead code, since its result would be overwritten
# before anything reads it.
# NOTE(review): get_transactions() appears to fetch the full transactions data
# set and may need network access -- confirm against the package docs.
transactions <- get_transactions()
# transactions <- completejourney::transactions
demographics <- completejourney::demographics
products <- completejourney::products
# 1. Plot: Top 5 products by households with and without children
# Data preparation: total sales per (kids_count, product_category), keeping
# the five best-selling categories within each kids_count level.
# NOTE(review): the title talks about households "with and without" kids, but
# the grouping is on every kids_count level -- confirm which one is intended.
child_status_sales <- transactions %>%
  # Only households with a demographics record can have a kids_count, so drop
  # the rest before the (larger) product join instead of after it.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(kids_count)) %>%
  left_join(products, by = "product_id") %>%
  group_by(kids_count, product_category) %>%
  # Keep the kids_count grouping explicitly so the top-5 selection below is
  # per kids_count level (and summarise() no longer emits a grouping message).
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  # slice_max() is the current replacement for the superseded top_n(); ties
  # are kept, as before.
  slice_max(total_sales, n = 5) %>%
  ungroup() %>%
  arrange(kids_count, desc(total_sales))

# Create plot: dodged horizontal bars, one colour per kids_count level.
ggplot(
  child_status_sales,
  aes(
    x = reorder(product_category, total_sales),
    y = total_sales,
    fill = as.factor(kids_count)
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Top 5 Products for Households with and without Kids",
    subtitle = "Comparison of sales value for top 5 products",
    x = "Product Category",
    y = "Total Sales (USD)",
    fill = "Kids Status",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()

2. Plot: Total Sales Value by Age Group
# 2. Plot: Total Sales Value by Age Group
# Data preparation: total sales value per demographic age group.
age_group_sales <- transactions %>%
  # Households without a demographics record would otherwise be summed into
  # an NA "age group" and drawn as a bar of their own; keep only households
  # whose age group is known. No product attributes are needed here, so the
  # products table is not joined.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(age)) %>%
  group_by(age) %>% # `age` is the demographics column holding the age group
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales)) # Largest total first

# Create plot: horizontal bars ordered by total sales.
ggplot(
  age_group_sales,
  aes(x = reorder(age, total_sales), y = total_sales, fill = age)
) +
  geom_col() +
  coord_flip() + # Flip coordinates for better readability
  labs(
    title = "Total Sales Value by Age Group",
    subtitle = "Analysis of total sales across different age groups",
    x = "Age Group",
    y = "Total Sales Value (USD)",
    fill = "Age Group",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal() +
  # After coord_flip() the horizontal axis carries the sales values; angle
  # its tick labels so long numbers do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3. Plot: Top 5 products by income level
# 3. Data preparation: Top 5 products by income level
# For each income level, keep the five product categories with the highest
# total sales and compute their sales value per unit of quantity.
income_level_top_products <- transactions %>%
  # Households without a demographics record have no income level; without
  # this filter they would form an NA "income level" with its own top 5.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(income)) %>%
  left_join(products, by = "product_id") %>%
  group_by(income, product_category) %>%
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    total_quantity = sum(quantity, na.rm = TRUE),
    .groups = "drop_last" # stay grouped by income for the top-5 selection
  ) %>%
  # slice_max() replaces the superseded top_n(); ties are kept, as before.
  slice_max(total_sales, n = 5) %>%
  ungroup() %>%
  arrange(income, desc(total_sales)) %>%
  # A zero total quantity would give Inf/NaN; report NA for that case instead.
  mutate(sales_per_quantity = total_sales / na_if(total_quantity, 0))

# Create plot: Total sales value per quantity for top 5 products in each
# income level
ggplot(
  income_level_top_products,
  aes(
    x = reorder(product_category, sales_per_quantity),
    y = sales_per_quantity,
    fill = income
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Top 5 Products by Income Level",
    subtitle = "Total Sales Value per Quantity for Top 5 Products Purchased by Each Income Level",
    x = "Product Category",
    y = "Total Sales per Quantity (USD)",
    fill = "Income Level",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4. Plot: Purchase frequency vs. total spend by household size
# 4. Plot: Purchase frequency vs. total spend by household size
# Data preparation: one row per household size with its total sales value and
# its number of transaction rows.
household_size_sales <- transactions %>%
  # Households without a demographics record have no household size; left in,
  # they form an NA group that is not a household size and cannot be given a
  # colour in the plot below.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(household_size)) %>%
  group_by(household_size) %>%
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    # NOTE(review): n() counts transaction rows, not distinct shopping trips;
    # confirm this is the intended meaning of "purchase frequency".
    purchase_frequency = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales))

# Create plot: one point per household size.
ggplot(
  household_size_sales,
  aes(
    x = purchase_frequency,
    y = total_sales,
    color = as.factor(household_size)
  )
) +
  geom_point(size = 3) +
  labs(
    title = "Purchase Frequency vs Total Spend by Household Size",
    subtitle = "Relationship between how often households purchase and how much they spend",
    x = "Purchase Frequency",
    y = "Total Sales (USD)",
    color = "Household Size",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()

5. Plot: Sales distribution by household income level
# 5. Plot: Sales distribution by household income level
# Data preparation: total sales value per income level.
income_sales <- transactions %>%
  # Households without a demographics record have no income level; without
  # this filter their sales would be summed into an NA bar and plotted as if
  # it were an income level.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(income)) %>%
  group_by(income) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales))

# Create plot: horizontal bars ordered by total sales.
ggplot(
  income_sales,
  aes(x = reorder(income, total_sales), y = total_sales)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Total Sales by Income Level",
    subtitle = "Distribution of total sales value across different income levels",
    x = "Income Level",
    y = "Total Sales (USD$)",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()
