This project aims to analyze shopping patterns using the CompleteJourney dataset. We focus on age groups, income levels, and seasonal trends to determine product demand variations.
“How do the shopping habits of different age groups affect spending across various food product categories, and how can Regork tailor its offerings and marketing strategies to increase sales in these categories?”
By understanding how distinct age demographics shop and what products they gravitate toward, we can create more targeted marketing strategies and fine-tune product offerings. My analysis aims to provide Regork with data-driven insights into which customer segments offer the most significant opportunities for growth and how best to tailor our approach to meet their needs.
The following R Packages are required in order to run the code in this R project:
# Load necessary libraries
library(dplyr) # Data manipulation
library(ggplot2) # Data visualization
library(scales) # Formatting scales (e.g., dollar_format)
library(lubridate) # Handling date-time data
library(tidyr) # Data tidying
library(completejourney) # Access to CompleteJourney dataset
library(htmltools) # HTML output formatting for reports
The data for this R project can be accessed from the CompleteJourney website.
# Load data
transactions <- completejourney::get_transactions()
products <- completejourney::products
demographics <- completejourney::demographics
# Convert 'product_id' to character
transactions <- transactions %>% mutate(product_id = as.character(product_id))
products <- products %>% mutate(product_id = as.character(product_id))
# Add month and season columns
transactions <- transactions %>%
mutate(month = format(transaction_timestamp, "%m")) %>%
mutate(season = case_when(
month %in% c("12", "01", "02") ~ "Winter",
month %in% c("03", "04", "05") ~ "Spring",
month %in% c("06", "07", "08") ~ "Summer",
month %in% c("09", "10", "11") ~ "Fall",
TRUE ~ NA_character_
))
grocery_categories <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
filter(department == "GROCERY") %>% # Filter only the Grocery department
group_by(product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales)) %>%
slice_max(order_by = total_sales, n = 10) # Select the top 10 product categories
ggplot(grocery_categories, aes(x = reorder(product_category, total_sales), y = total_sales, fill = product_category)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
labs(
title = "Top 10 Grocery Categories by Total Sales",
subtitle = "Product categories within the Grocery department with the highest revenue",
x = "Grocery Product Category",
y = "Total Sales ($)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
grocery_19_34 <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
inner_join(completejourney::demographics, by = "household_id") %>%
filter(department == "GROCERY", age %in% c("19-24", "25-34")) %>%
group_by(product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales)) %>%
slice_max(order_by = total_sales, n = 5)
ggplot(grocery_19_34, aes(x = reorder(product_category, total_sales), y = total_sales, fill = product_category)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
labs(
title = "Top 5 Grocery Categories Purchased by Age Group 19-34",
subtitle = "Highest spending categories within the grocery department",
x = "Grocery Product Category",
y = "Total Sales ($)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
grocery_35_54 <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
inner_join(completejourney::demographics, by = "household_id") %>%
filter(department == "GROCERY", age %in% c("35-44", "45-54")) %>%
group_by(product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales)) %>%
slice_max(order_by = total_sales, n = 5)
ggplot(grocery_35_54, aes(x = reorder(product_category, total_sales), y = total_sales, fill = product_category)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
labs(
title = "Top 5 Grocery Categories Purchased by Age Group 35-54",
subtitle = "Highest spending categories within the grocery department",
x = "Grocery Product Category",
y = "Total Sales ($)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
grocery_55_plus <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
inner_join(completejourney::demographics, by = "household_id") %>%
filter(department == "GROCERY", age %in% c("55-64", "65+")) %>%
group_by(product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales)) %>%
slice_max(order_by = total_sales, n = 5)
ggplot(grocery_55_plus, aes(x = reorder(product_category, total_sales), y = total_sales, fill = product_category)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
labs(
title = "Top 5 Grocery Categories Purchased by Age Group 55+",
subtitle = "Highest spending categories within the grocery department",
x = "Grocery Product Category",
y = "Total Sales ($)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
income_grocery_spending <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
inner_join(completejourney::demographics, by = "household_id") %>%
filter(department == "GROCERY",
product_category %in% c("SOFT DRINKS", "CHEESE", "FLUID MILK PRODUCTS", "BEERS/ALES", "FROZEN PIZZA")) %>%
mutate(
income_group = case_when(
income %in% c("Under 15K", "15-24K", "25-34K", "35-49K") ~ "0-50K",
income %in% c("50-74K", "75-99K") ~ "50-100K",
income %in% c("100-124K", "125-149K", "150-174K", "175-199K", "200-249K", "250K+") ~ "100K+"
)
) %>%
group_by(income_group, product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'income_group'. You can override using the
## `.groups` argument.
ggplot(income_grocery_spending, aes(x = reorder(product_category, total_sales),
y = total_sales, fill = income_group)) +
geom_col(position = "dodge") +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
labs(
title = "Spending on Key Grocery Categories by Income Group",
subtitle = "Comparing total spending across Soft Drinks, Cheese, Fluid Milk, Beers/Ales, Frozen Pizza",
x = "Grocery Product Category",
y = "Total Sales ($)",
fill = "Income Group",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
income_category_spending <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
inner_join(completejourney::demographics, by = "household_id") %>%
mutate(
income_group = case_when(
income %in% c("Under 15K", "15-24K", "25-34K", "35-49K", "50-74K", "75-99K") ~ "Below_100K",
income %in% c("100-124K", "125-149K", "150-174K", "175-199K", "200-249K", "250K+") ~ "Above_100K"
)
) %>%
group_by(income_group, product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'income_group'. You can override using the
## `.groups` argument.
# Transform data to compare the spending difference
income_spending_diff <- income_category_spending %>%
pivot_wider(names_from = income_group, values_from = total_sales, values_fill = 0) %>%
mutate(difference = Above_100K - Below_100K) %>% # Fix: Use column names without `< >`
arrange(desc(difference)) %>%
slice_max(order_by = difference, n = 3) # Select **top 3** categories where high-income spends more
ggplot(income_spending_diff, aes(x = reorder(product_category, difference), y = difference, fill = difference)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
scale_fill_gradient(low = "blue", high = "red") + # Color gradient to highlight categories
labs(
title = "Top 3 Product Categories Where High-Income Consumers Spend More",
subtitle = "Comparing spending of Above 100K vs. Below 100K income groups",
x = "Product Category",
y = "Spending Difference ($)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
income_category_spending <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
inner_join(completejourney::demographics, by = "household_id") %>%
mutate(
income_group = case_when(
income %in% c("Under 15K", "15-24K", "25-34K", "35-49K", "50-74K", "75-99K") ~ "Below_100K",
income %in% c("100-124K", "125-149K", "150-174K", "175-199K", "200-249K", "250K+") ~ "Above_100K"
)
) %>%
group_by(income_group, product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'income_group'. You can override using the
## `.groups` argument.
# Transform data to compare the spending difference
low_income_spending_diff <- income_category_spending %>%
pivot_wider(names_from = income_group, values_from = total_sales, values_fill = 0) %>%
mutate(difference = Below_100K - Above_100K) %>% # Reverse difference for **low-income focus**
arrange(desc(difference)) %>%
slice_max(order_by = difference, n = 3) # Select **top 3 categories** where **low-income** spends more
ggplot(low_income_spending_diff, aes(x = reorder(product_category, difference), y = difference, fill = difference)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
scale_fill_gradient(low = "blue", high = "red") + # Color gradient to highlight categories
labs(
title = "Top 3 Product Categories Where Low-Income Consumers Spend More",
subtitle = "Comparing spending of Below 100K vs. Above 100K income groups",
x = "Product Category",
y = "Spending Difference ($)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
seasonal_sales <- completejourney::get_transactions() %>%
inner_join(completejourney::products, by = "product_id") %>%
mutate(
season = case_when(
month(transaction_timestamp) %in% c(12, 1, 2) ~ "Winter",
month(transaction_timestamp) %in% c(3, 4, 5) ~ "Spring",
month(transaction_timestamp) %in% c(6, 7, 8) ~ "Summer",
month(transaction_timestamp) %in% c(9, 10, 11) ~ "Fall"
)
) %>%
group_by(product_category, season) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'product_category'. You can override using
## the `.groups` argument.
seasonal_variation <- seasonal_sales %>%
group_by(product_category) %>%
summarise(
mean_sales = mean(total_sales, na.rm = TRUE),
sd_sales = sd(total_sales, na.rm = TRUE), # Standard deviation
cv = sd_sales / mean_sales # Coefficient of variation (CV)
) %>%
filter(!is.na(cv) & mean_sales > 100) %>% # Avoid errors for small-sale items
arrange(desc(cv)) %>%
slice_max(order_by = cv, n = 10) # Get top 10 most seasonal items
ggplot(seasonal_variation, aes(x = reorder(product_category, cv), y = cv, fill = cv)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = percent_format(scale = 1)) + # Show % variation
scale_fill_gradient(low = "blue", high = "red") + # Highlight seasonal products
labs(
title = "Top 10 Most Seasonal Product Categories",
subtitle = "Products with the highest sales variation across seasons",
x = "Product Category",
y = "Seasonal Variation (%)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
# Load transactions data
transactions <- completejourney::get_transactions()
# Load the products data
products <- completejourney::products
library(ggplot2)
library(dplyr)
# Function to get top 10 product types for a given product category
get_top_product_graph <- function(category_name) {
top_products <- transactions %>%
inner_join(products, by = "product_id") %>%
filter(product_category == category_name) %>%
group_by(product_type) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales)) %>%
slice_max(order_by = total_sales, n = 10) # Select top 10
ggplot(top_products, aes(x = reorder(product_type, total_sales), y = total_sales, fill = product_type)) +
geom_col(show.legend = FALSE) +
coord_flip() + # Flip for better readability
labs(
title = paste("Top 10 Product Types for", category_name),
x = "Product Type",
y = "Total Sales ($)",
caption = "Data Source: CompleteJourney"
) +
theme_minimal()
}
# Generate graphs for each category
valentine_graph <- get_top_product_graph("VALENTINE")
halloween_graph <- get_top_product_graph("HALLOWEEN")
easter_graph <- get_top_product_graph("EASTER")
# Display the graphs
print(valentine_graph)
print(halloween_graph)
print(easter_graph)
Young Adults (19-34): Focus marketing on ready-to-eat meals, beverages, and snack foods. Use social media promotions and in-store deals for impulse purchases.
Middle-Aged Consumers (35-54): Highlight dairy products, meats, and home-cooking essentials. Implement loyalty programs for bulk purchases.
Older Consumers (55+): Promote nutritious, easy-to-prepare meals with discounts on staple products like beef and dairy.