1. Plot: Top 5 products by households with and without children

# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney) # Assuming the completejourney package is loaded
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(dplyr)

library(completejourney)
# Load datasets from the completejourney package.
# The package bundles only a sample of the transactions
# (`transactions_sample`); there is no bundled `transactions` dataset, so the
# full table is fetched with get_transactions() rather than data() or `::`.
transactions <- get_transactions()
demographics <- completejourney::demographics
products <- completejourney::products

# 1. Plot: Top 5 products by households with and without children

# Data preparation: total sales per product category, split by whether the
# household has children, keeping the five best-selling categories per group.
child_status_sales <- transactions %>%
  # Only households with a demographics record can be classified.
  inner_join(demographics, by = "household_id") %>%
  left_join(products, by = "product_id") %>%
  filter(!is.na(kids_count)) %>%
  # Collapse the individual child counts into the two groups the plot title
  # promises ("with and without kids").
  # NOTE(review): assumes households without children are coded as "0" in
  # `kids_count` -- confirm against the demographics documentation.
  mutate(
    kids_status = if_else(
      as.character(kids_count) == "0", "Without kids", "With kids"
    )
  ) %>%
  group_by(kids_status, product_category) %>%
  # Stay grouped by kids_status so the ranking below is done per group.
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  slice_max(total_sales, n = 5) %>%
  ungroup() %>%
  arrange(kids_status, desc(total_sales))

# Create plot: dodged horizontal bars, one colour per kids status
ggplot(
  child_status_sales,
  aes(
    x = reorder(product_category, total_sales),
    y = total_sales,
    fill = kids_status
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Top 5 Products for Households with and without Kids",
    subtitle = "Comparison of sales value for top 5 products",
    x = "Product Category",
    y = "Total Sales (USD)",
    fill = "Kids Status",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()

2. Plot: Total Sales Value by Age Group

# 2. Plot: Total Sales Value by Age Group

# Data preparation: total sales value per household age group.
age_group_sales <- transactions %>%
  # inner_join keeps only households with a demographics record; a left join
  # would lump every other household into a single NA "age group".
  # The products table is not joined because no product column is used here.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(age)) %>%
  group_by(age) %>%  # `age` is the demographics column holding the age group
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales))  # Largest total first

# Create plot: one horizontal bar per age group, ordered by total sales
age_group_sales %>%
  ggplot(
    mapping = aes(
      x = reorder(age, total_sales),
      y = total_sales,
      fill = age
    )
  ) +
  geom_col() +
  # Flipped so the age-group labels read horizontally
  coord_flip() +
  theme_minimal() +
  # Applied after theme_minimal() so the tweak is not overwritten
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Total Sales Value by Age Group",
    subtitle = "Analysis of total sales across different age groups",
    x = "Age Group",
    y = "Total Sales Value (USD)",
    fill = "Age Group",
    caption = "Data: CompleteJourney"
  )

3. Plot: Top 5 products by income level

# 3. Data preparation: top 5 product categories by total sales within each
# income level, plus the sales value per unit of quantity for each of them.
income_level_top_products <- transactions %>%
  # inner_join + NA filter: households without a demographics record have no
  # income level and would otherwise be ranked as their own NA group.
  inner_join(demographics, by = "household_id") %>%
  left_join(products, by = "product_id") %>%
  filter(!is.na(income)) %>%
  group_by(income, product_category) %>%
  # Stay grouped by income so the top-5 selection below is per income level.
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    total_quantity = sum(quantity, na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  slice_max(total_sales, n = 5) %>%
  ungroup() %>%
  arrange(income, desc(total_sales)) %>%
  mutate(sales_per_quantity = total_sales / total_quantity)

# Create plot: sales value per quantity for the top 5 product categories of
# each income level, drawn as dodged horizontal bars coloured by income
income_level_top_products %>%
  ggplot(
    mapping = aes(
      x = reorder(product_category, sales_per_quantity),
      y = sales_per_quantity,
      fill = income
    )
  ) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_minimal() +
  # Applied after theme_minimal() so the tweak is not overwritten
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 5 Products by Income Level",
    subtitle = "Total Sales Value per Quantity for Top 5 Products Purchased by Each Income Level",
    x = "Product Category",
    y = "Total Sales per Quantity (USD)",
    fill = "Income Level",
    caption = "Data: CompleteJourney"
  )

4. Plot: Purchase frequency vs. total spend by household size

# 4. Plot: Purchase frequency vs. total spend by household size

# Data preparation: total spend and number of purchases per household size.
household_size_sales <- transactions %>%
  # inner_join + NA filter: households without a demographics record have no
  # household size; left in, they form an NA group that the plot cannot colour.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(household_size)) %>%
  group_by(household_size) %>%
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    # Count distinct baskets rather than rows.
    # NOTE(review): assumes each transactions row is one product line within
    # a basket, so n() would count items, not purchases -- confirm.
    purchase_frequency = n_distinct(basket_id),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales))

# Create plot: one point per household size, purchase frequency against spend
household_size_sales %>%
  ggplot(
    mapping = aes(
      x = purchase_frequency,
      y = total_sales,
      color = as.factor(household_size)
    )
  ) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(
    title = "Purchase Frequency vs Total Spend by Household Size",
    subtitle = "Relationship between how often households purchase and how much they spend",
    x = "Purchase Frequency",
    y = "Total Sales (USD)",
    color = "Household Size",
    caption = "Data: CompleteJourney"
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

5. Plot: Sales distribution by household income level

# 5. Plot: Sales distribution by household income level
# Data preparation: total sales value per household income level.
income_sales <- transactions %>%
  # inner_join + NA filter: households without a demographics record have no
  # income level and would otherwise show up as a single NA bar.
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(income)) %>%
  group_by(income) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_sales))

# Create plot: one horizontal bar per income level, ordered by total sales
ggplot(
  income_sales,
  aes(x = reorder(income, total_sales), y = total_sales)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Total Sales by Income Level",
    subtitle = "Distribution of total sales value across different income levels",
    x = "Income Level",
    # "(USD)" matches the currency label used on the other plots
    y = "Total Sales (USD)",
    caption = "Data: CompleteJourney"
  ) +
  theme_minimal()