Plot 1: Probability of Any Purchase by Income Group Over Time

# Load the data
transactions <- get_data("transactions")
data("demographics")

# Define income ranges and categories
income_ranges <- list(
  "Under 25K" = c("Under 15K", "15-24K"),
  "25K - 49K" = c("25-34K", "35-49K"),
  "50K - 74K" = c("50-74K"),
  "75K and above" = c("75-99K", "100K+")
)

# Function to categorize income
categorize_income <- function(income) {
  for (category in names(income_ranges)) {
    if (income %in% income_ranges[[category]]) {
      return(category)
    }
  }
  return("Unknown")
}

# Apply the function to the income column
demographics$income_category <- sapply(demographics$income, categorize_income)

# Join transactions and demographics data
transactions_demographics <- transactions %>%
  inner_join(demographics, by = "household_id")

# Calculate the probability of any purchase for each income group over time
probability_data <- transactions_demographics %>%
  mutate(month = floor_date(transaction_timestamp, "month")) %>%
  group_by(income_category, month) %>%
  summarize(purchase_count = n()) %>%
  group_by(income_category) %>%
  mutate(probability = purchase_count / sum(purchase_count))

# Create the plot with trendline
plot1 <- ggplot(probability_data, aes(x = month, y = probability, color = income_category)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +  # Add trendline
  labs(title = "Probability of Any Purchase by Income Group Over Time",
       x = "Month",
       y = "Probability",
       color = "Income Group") +
  theme_minimal()

#Plot 2: Difference in Average Sales Value by Income Group

# Load the data
transactions <- get_data("transactions")
data("demographics")

# Define income ranges and categories
income_ranges <- list(
  "Under 25K" = c("Under 15K", "15-24K"),
  "25K - 49K" = c("25-34K", "35-49K"),
  "50K - 74K" = c("50-74K"),
  "75K and above" = c("75-99K", "100K+")
)

# Function to categorize income
categorize_income <- function(income) {
  for (category in names(income_ranges)) {
    if (income %in% income_ranges[[category]]) {
      return(category)
    }
  }
  return("Unknown")
}

# Apply the function to the income column
demographics$income_category <- sapply(demographics$income, categorize_income)

# Join transactions and demographics data
transactions_demographics <- transactions %>%
  inner_join(demographics, by = "household_id")

# Calculate the average sales value for each income group
avg_sales_value <- transactions_demographics %>%
  group_by(income_category) %>%
  summarize(avg_sales_value = mean(sales_value, na.rm = TRUE))

# Calculate the difference from the overall average
overall_avg_sales_value <- mean(transactions$sales_value, na.rm = TRUE)
avg_sales_value <- avg_sales_value %>%
  mutate(difference = avg_sales_value - overall_avg_sales_value)

# Create the diverging bar plot
plot2 <- ggplot(avg_sales_value, aes(x = reorder(income_category, difference), y = difference, fill = difference > 0)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Difference in Average Sales Value by Income Group",
       x = "Income Group",
       y = "Difference from Overall Average",
       fill = "Above Average") +
  theme_minimal()

#Plot 3: Total Sales Value by Top 5 Product Categories and Age Category

# Load necessary libraries
library(tidyverse)
library(completejourney)
library(ggplot2)
library(data.table)

# Load the data
transactions <- get_data("transactions")
promotions <- get_data("promotions")
data("products")
data("demographics")

# Convert data to data.table for efficient manipulation
transactions <- as.data.table(transactions)
products <- as.data.table(products)
promotions <- as.data.table(promotions)
demographics <- as.data.table(demographics)

# Reduce the size of the datasets by selecting only necessary columns
transactions_small <- transactions[, .(household_id, product_id, sales_value, quantity)]
demographics_small <- demographics[, .(household_id, age, income)]
products_small <- products[, .(product_id, product_category)]

# Sample a smaller subset of the data to reduce memory usage
set.seed(123)  # For reproducibility
sample_size <- 100000  # Adjust the sample size as needed
transactions_sample <- transactions_small[sample(.N, sample_size)]

# Ensure transactions_sample is a data.table
transactions_sample <- as.data.table(transactions_sample)

# Join transactions and products data
transactions_products <- transactions_sample[products_small, on = "product_id"]

# Join transactions and demographics data
transactions_demographics <- transactions_sample[demographics_small, on = "household_id"]

# Define age ranges and categories
age_ranges <- list(
  "Under 25" = c("18-24"),
  "25-34" = c("25-34"),
  "35-44" = c("35-44"),
  "45-54" = c("45-54"),
  "55-64" = c("55-64"),
  "65 and above" = c("65+")
)

# Function to categorize age
categorize_age <- function(age) {
  for (category in names(age_ranges)) {
    if (age %in% age_ranges[[category]]) {
      return(category)
    }
  }
  return("Unknown")
}

# Apply the function to the age column
transactions_demographics[, age_category := sapply(age, categorize_age)]

# Aggregate transactions data to ensure a one-to-many relationship
transactions_agg <- transactions_products[, .(total_sales_value = sum(sales_value, na.rm = TRUE),
                                              total_quantity = sum(quantity, na.rm = TRUE)),
                                          by = .(household_id, product_id, product_category)]

# Join aggregated transactions with demographics data, allowing cartesian join
transactions_products_demographics <- transactions_agg[transactions_demographics, on = "household_id", allow.cartesian = TRUE]

# Remove rows with missing values in product_category or age_category
transactions_products_demographics <- transactions_products_demographics[!is.na(product_category) & !is.na(age_category)]

# Get the top 5 product categories by total sales value
top_5_product_categories <- transactions_products_demographics[, .(total_sales_value = sum(total_sales_value, na.rm = TRUE)),
                                                               by = product_category][order(-total_sales_value)][1:5, product_category]

# Filter the data to include only the top 5 product categories
filtered_data <- transactions_products_demographics[product_category %in% top_5_product_categories]

# Aggregate data to calculate total sales value by product category and age category
sales_by_product_age <- filtered_data[, .(total_sales_value = sum(total_sales_value, na.rm = TRUE)),
                                      by = .(product_category, age_category)]

# Create the plot
plot3 <- ggplot(sales_by_product_age, aes(x = reorder(product_category, total_sales_value), y = total_sales_value, fill = age_category)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(title = "Total Sales Value by Top 5 Product Categories and Age Category",
       x = "Product Category",
       y = "Total Sales Value",
       fill = "Age Category",
       caption = "Data Source: completejourney") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8, angle = 0, hjust = 1),
        plot.title = element_text(size = 14, face = "bold"),
        plot.caption = element_text(size = 10),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12))

Week 5 Lab - Rice

Plot 1: Probability of Any Purchase by Income Group Over Time