For this lab we are using the “completejourney” dataset. I have created the three plots listed below:

  1. Average Spending by Age Group and Household Size
  2. Customer Segmentation by Spending and Frequency
  3. Total Sales Value per Quantity for Top 5 Products by Income Level

The code and the resulting plot for each are shown below.

Loading the libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
# Alias `transactions_sample` (presumably the sample data shipped with
# completejourney -- confirm) as `transactions`, the name every analysis
# below reads from.
transactions <- transactions_sample

1. Average Spending by Age Group and Household Size

# Attach each household's demographics to its transaction rows, then average
# the row-level sales value within every age / household-size combination.
spending_by_demographics <- inner_join(
  transactions,
  demographics,
  by = "household_id"
) %>%
  group_by(age, household_size) %>%
  summarise(
    average_spending = mean(sales_value, na.rm = TRUE),
    .groups = "drop"
  )

# Create the plot: one dodged bar per age group / household size, with the
# bar height taken from the precomputed average_spending column.
ggplot(
  data = spending_by_demographics,
  aes(x = age, y = average_spending, fill = as.factor(household_size))
) +
  # Heights are already computed, so geom_col() replaces
  # geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  labs(
    title = "Average Spending by Age Group and Household Size",
    # Built line by line so the source indentation does not leak into the
    # rendered subtitle; the line breaks themselves are unchanged.
    subtitle = paste(
      "The bar chart displays average spending across different",
      "age groups, differentiated by household size. This helps uncover",
      "trends such as which age groups and household sizes tend to spend more",
      "on purchases.",
      sep = "\n"
    ),
    x = "Age Group",
    y = "Average Spending ($)",
    fill = "Household Size"
  ) +
  theme_minimal() +
  scale_y_continuous(labels = scales::dollar_format())

2. Customer Segmentation by Spending and Frequency

# Summarise each household's total spending and shopping frequency, then
# attach its demographics. Households without a demographics row are dropped
# by the inner join.
customer_data <- transactions %>%
  group_by(household_id) %>%
  summarize(
    # na.rm = TRUE matches the other sales_value aggregations in this report.
    total_spending = sum(sales_value, na.rm = TRUE),
    # In the completejourney transaction data a row is one product line within
    # a basket, so n() would count line items. Distinct baskets give the
    # number of purchase occasions.
    purchase_frequency = n_distinct(basket_id),
    .groups = "drop"
  ) %>%
  inner_join(demographics, by = "household_id")

# Create the scatter plot: one point per household, positioned by purchase
# frequency and total spending and coloured by household size.
ggplot(
  customer_data,
  aes(x = purchase_frequency, y = total_spending, color = household_size)
) +
  # Semi-transparent points keep overlapping households visible.
  geom_point(alpha = 0.6, size = 3) +
  labs(
    title = "Customer Segmentation by Spending and Frequency",
    # Built line by line so the source indentation does not leak into the
    # rendered subtitle; the line breaks themselves are unchanged.
    subtitle = paste(
      "The graph shows how customer spending increases with",
      "purchase frequency, highlighting the impact of household size on",
      "spending behavior. Larger households tend to shop more frequently and",
      "spend more overall.",
      sep = "\n"
    ),
    x = "Number of Purchases",
    y = "Total Spending",
    color = "Household Size"
  ) +
  theme_minimal() +
  # Format the monetary axis the same way as the other two plots.
  scale_y_continuous(labels = scales::dollar_format())

3. Total Sales Value per Quantity for Top 5 Products by Income Level

# Aggregate sales value and units sold for every product within each income
# level (only households that have a demographics row are kept).
product_sales_by_income <- transactions %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(income, product_id) %>%
  summarize(
    total_sales_value = sum(sales_value, na.rm = TRUE),
    # Units sold come from the quantity column; counting rows with n() would
    # treat every transaction line as a single item.
    total_quantity = sum(quantity, na.rm = TRUE),
    .groups = "drop"
  )

# Calculate sales value per quantity
product_sales_by_income <- product_sales_by_income %>%
  mutate(
    # A zero total quantity would divide to Inf/NaN; report NA instead.
    sales_value_per_quantity = ifelse(
      total_quantity > 0,
      total_sales_value / total_quantity,
      NA_real_
    )
  )

# Get top 5 products for each income level based on total sales value.
# slice_max() ranks within each income group explicitly (with_ties = FALSE
# caps every group at exactly five rows); ungroup() so later steps do not
# inherit the grouping.
top_products <- product_sales_by_income %>%
  group_by(income) %>%
  slice_max(total_sales_value, n = 5, with_ties = FALSE) %>%
  ungroup()

# Create the plot: horizontal dodged bars of sales value per quantity for the
# selected products, ordered by that value and filled by income level.
ggplot(
  top_products,
  aes(
    x = reorder(product_id, sales_value_per_quantity),
    y = sales_value_per_quantity,
    fill = income
  )
) +
  # Heights are already computed, so geom_col() replaces
  # geom_bar(stat = "identity").
  geom_col(position = position_dodge()) +
  labs(
    title = "Total Sales Value per Quantity for Top 5 Products by Income Level",
    # Built line by line so the source indentation does not leak into the
    # rendered subtitle; the line breaks themselves are unchanged.
    subtitle = paste(
      "The bar plot shows the sales value per quantity for the",
      "top 5 products purchased by each income level, allowing for insights",
      "into which products are generating higher revenue per item sold within",
      "different income brackets.",
      sep = "\n"
    ),
    x = "Product ID",
    y = "Sales Value per Quantity ($)",
    fill = "Income Level"
  ) +
  # Smaller base font so the many product labels fit.
  theme_minimal(base_size = 8) +
  scale_y_continuous(labels = scales::dollar_format()) +
  # Flip so product IDs read horizontally along the vertical axis.
  coord_flip()