knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(completejourney)
library(ggplot2)
library(lubridate)
library(knitr)
# Load the data
transactions <- get_data("transactions")
data("demographics")
# Define income ranges and categories
income_ranges <- list(
"Under 25K" = c("Under 15K", "15-24K"),
"25K - 49K" = c("25-34K", "35-49K"),
"50K - 74K" = c("50-74K"),
"75K and above" = c("75-99K", "100K+")
)
# Function to categorize income
categorize_income <- function(income) {
for (category in names(income_ranges)) {
if (income %in% income_ranges[[category]]) {
return(category)
}
}
return("Unknown")
}
# Apply the function to the income column
demographics$income_category <- sapply(demographics$income, categorize_income)
# Join transactions and demographics data
transactions_demographics <- transactions %>%
inner_join(demographics, by = "household_id")
# Calculate the probability of any purchase for each income group over time
probability_data <- transactions_demographics %>%
mutate(month = floor_date(transaction_timestamp, "month")) %>%
group_by(income_category, month) %>%
summarize(purchase_count = n()) %>%
group_by(income_category) %>%
mutate(probability = purchase_count / sum(purchase_count))
# Create the plot with trendline
plot1 <- ggplot(probability_data, aes(x = month, y = probability, color = income_category)) +
geom_line() +
geom_smooth(method = "lm", se = FALSE) + # Add trendline
labs(title = "Probability of Any Purchase by Income Group Over Time",
x = "Month",
y = "Probability",
color = "Income Group") +
theme_minimal()
#Plot 2: Difference in Average Sales Value by Income Group
# Load the data
transactions <- get_data("transactions")
data("demographics")
# Define income ranges and categories
income_ranges <- list(
"Under 25K" = c("Under 15K", "15-24K"),
"25K - 49K" = c("25-34K", "35-49K"),
"50K - 74K" = c("50-74K"),
"75K and above" = c("75-99K", "100K+")
)
# Function to categorize income
categorize_income <- function(income) {
for (category in names(income_ranges)) {
if (income %in% income_ranges[[category]]) {
return(category)
}
}
return("Unknown")
}
# Apply the function to the income column
demographics$income_category <- sapply(demographics$income, categorize_income)
# Join transactions and demographics data
transactions_demographics <- transactions %>%
inner_join(demographics, by = "household_id")
# Calculate the average sales value for each income group
avg_sales_value <- transactions_demographics %>%
group_by(income_category) %>%
summarize(avg_sales_value = mean(sales_value, na.rm = TRUE))
# Calculate the difference from the overall average
overall_avg_sales_value <- mean(transactions$sales_value, na.rm = TRUE)
avg_sales_value <- avg_sales_value %>%
mutate(difference = avg_sales_value - overall_avg_sales_value)
# Create the diverging bar plot
plot2 <- ggplot(avg_sales_value, aes(x = reorder(income_category, difference), y = difference, fill = difference > 0)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Difference in Average Sales Value by Income Group",
x = "Income Group",
y = "Difference from Overall Average",
fill = "Above Average") +
theme_minimal()
#Plot 3: Total Sales Value by Top 5 Product Categories and Age Category
# Load necessary libraries
library(tidyverse)
library(completejourney)
library(ggplot2)
library(data.table)
# Load the data
transactions <- get_data("transactions")
promotions <- get_data("promotions")
data("products")
data("demographics")
# Convert data to data.table for efficient manipulation
transactions <- as.data.table(transactions)
products <- as.data.table(products)
promotions <- as.data.table(promotions)
demographics <- as.data.table(demographics)
# Reduce the size of the datasets by selecting only necessary columns
transactions_small <- transactions[, .(household_id, product_id, sales_value, quantity)]
demographics_small <- demographics[, .(household_id, age, income)]
products_small <- products[, .(product_id, product_category)]
# Sample a smaller subset of the data to reduce memory usage
set.seed(123) # For reproducibility
sample_size <- 100000 # Adjust the sample size as needed
transactions_sample <- transactions_small[sample(.N, sample_size)]
# Ensure transactions_sample is a data.table
transactions_sample <- as.data.table(transactions_sample)
# Join transactions and products data
transactions_products <- transactions_sample[products_small, on = "product_id"]
# Join transactions and demographics data
transactions_demographics <- transactions_sample[demographics_small, on = "household_id"]
# Define age ranges and categories
age_ranges <- list(
"Under 25" = c("18-24"),
"25-34" = c("25-34"),
"35-44" = c("35-44"),
"45-54" = c("45-54"),
"55-64" = c("55-64"),
"65 and above" = c("65+")
)
# Function to categorize age
categorize_age <- function(age) {
for (category in names(age_ranges)) {
if (age %in% age_ranges[[category]]) {
return(category)
}
}
return("Unknown")
}
# Apply the function to the age column
transactions_demographics[, age_category := sapply(age, categorize_age)]
# Aggregate transactions data to ensure a one-to-many relationship
transactions_agg <- transactions_products[, .(total_sales_value = sum(sales_value, na.rm = TRUE),
total_quantity = sum(quantity, na.rm = TRUE)),
by = .(household_id, product_id, product_category)]
# Join aggregated transactions with demographics data, allowing cartesian join
transactions_products_demographics <- transactions_agg[transactions_demographics, on = "household_id", allow.cartesian = TRUE]
# Remove rows with missing values in product_category or age_category
transactions_products_demographics <- transactions_products_demographics[!is.na(product_category) & !is.na(age_category)]
# Get the top 5 product categories by total sales value
top_5_product_categories <- transactions_products_demographics[, .(total_sales_value = sum(total_sales_value, na.rm = TRUE)),
by = product_category][order(-total_sales_value)][1:5, product_category]
# Filter the data to include only the top 5 product categories
filtered_data <- transactions_products_demographics[product_category %in% top_5_product_categories]
# Aggregate data to calculate total sales value by product category and age category
sales_by_product_age <- filtered_data[, .(total_sales_value = sum(total_sales_value, na.rm = TRUE)),
by = .(product_category, age_category)]
# Create the plot
plot3 <- ggplot(sales_by_product_age, aes(x = reorder(product_category, total_sales_value), y = total_sales_value, fill = age_category)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
labs(title = "Total Sales Value by Top 5 Product Categories and Age Category",
x = "Product Category",
y = "Total Sales Value",
fill = "Age Category",
caption = "Data Source: completejourney") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8, angle = 0, hjust = 1),
plot.title = element_text(size = 14, face = "bold"),
plot.caption = element_text(size = 10),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12))