suppressMessages(library(completejourney)) # Analyzes grocery shopping transaction data.
suppressMessages(library(dplyr)) # Simplifies data manipulation tasks.
suppressMessages(library(forcats)) # Works with categorical data and factors.
suppressMessages(library(ggplot2)) # Creates data visualizations.
suppressMessages(library(knitr)) # Integrates R code with markdown for reports.
suppressMessages(library(lubridate)) # Handles date and time data.
suppressMessages(library(readr)) # Reads and writes data quickly.
suppressMessages(library(stringr)) # Manipulates strings (text data).
suppressMessages(library(tidyr)) # Reshapes and tidies data.
suppressMessages(library(tidyverse)) # Collection of R packages for data science.
transactions <- get_transactions()
demographics
## # A tibble: 801 × 8
## household_id age income home_ownership marital_status household_size
## <chr> <ord> <ord> <ord> <ord> <ord>
## 1 1 65+ 35-49K Homeowner Married 2
## 2 1001 45-54 50-74K Homeowner Unmarried 1
## 3 1003 35-44 25-34K <NA> Unmarried 1
## 4 1004 25-34 15-24K <NA> Unmarried 1
## 5 101 45-54 Under 15K Homeowner Married 4
## 6 1012 35-44 35-49K <NA> Married 5+
## 7 1014 45-54 15-24K <NA> Married 4
## 8 1015 45-54 50-74K Homeowner Unmarried 1
## 9 1018 45-54 35-49K Homeowner Married 5+
## 10 1020 45-54 25-34K Homeowner Married 2
## # ℹ 791 more rows
## # ℹ 2 more variables: household_comp <ord>, kids_count <ord>
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
transactions
## # A tibble: 1,469,307 × 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 900 330 31198570044 1095275 1 0.5 0
## 2 900 330 31198570047 9878513 1 0.99 0.1
## 3 1228 406 31198655051 1041453 1 1.43 0.15
## 4 906 319 31198705046 1020156 1 1.5 0.29
## 5 906 319 31198705046 1053875 2 2.78 0.8
## 6 906 319 31198705046 1060312 1 5.49 0.5
## 7 906 319 31198705046 1075313 1 1.5 0.29
## 8 1058 381 31198676055 985893 1 1.88 0.21
## 9 1058 381 31198676055 988791 1 1.5 1.29
## 10 1058 381 31198676055 9297106 1 2.69 0
## # ℹ 1,469,297 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
The business problem I am trying to solve is how does the store maximize revenues from frequent and high spending customer demographics.
The CEO should be interested in this data analysis is because reporting on findings in the data set and extrapolating that into actionable ideas can greatly improve profit for the store, and direct the store’s focus into what can be improved for the future.
My proposed solution for the CEO consists of making changes to 2 areas of marketing and prices. For marketing, the age group of 45-54 is the most frequent customers, but the departments of meat, deli, and pastry were lacking. Concerted efforts into improvements and sales could see a rise in business for these departments from the store’s highest revenue generating age group of customers. Secondly, produce prices should increase, due to those with high incomes being the primary purchasers of food from this department. The prices can increase because the most frequent buyers have more money to spend on quality produce.
I took the complete journey data set to look at which age demographic gains the store the most total revenue. I chose total revenue instead of median revenue because a sum is more relevant when determining market share. From there, looking at which departments were most often shopped at can give more of an insight into what can be focused upon, and the lacking departments could see an increase in attention from marketing and promotion efforts. For the high income portion of the analysis I looked at customers who earned between 125-149K, and 150-175K, since this income range is somewhat common but still a higher end shopper at the store. From there I ranked the top 7 departments that food was purchased from to uncover the trend of produce being the most purchased department of food.
merged_data <- transactions %>%
left_join(demographics, by = "household_id")
# Calculate revenue for each demographic group
merged_data <- merged_data %>%
mutate(revenue = sales_value * quantity) #Determining total revenue
revenue_by_demographics <- merged_data %>%
group_by(age, product_id) %>%
summarise(total_revenue = sum(revenue), .groups = "drop")
# Load necessary library
suppressMessages(library(scales))
revenue_by_demographics_clean <- revenue_by_demographics %>%
drop_na(total_revenue, age) # Drop rows where total_revenue or age is NA
ggplot(revenue_by_demographics_clean, aes(x = age, y = total_revenue)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Total Revenue by Age Group", x = "Age Group", y = "Total Revenue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Rotate x-axis labels
scale_y_continuous(labels = label_number()) # Remove scientific notation
# Join transaction data with product data to get department information
transactions_with_departments <- transactions %>%
left_join(products, by = "product_id")
# Count purchases by department
department_purchase_counts <- transactions_with_departments %>%
count(department) %>%
arrange(desc(n)) # Sort by the number of purchases (n)
# Filter out 'grocery' and get the top 10 departments
top_departments <- department_purchase_counts %>%
filter(department != "GROCERY") %>%
slice_max(order_by = n, n = 10) %>% # Select top 10 based on purchase count (n)
arrange(desc(n)) # Sort in descending order
# Plot the top 10 most purchased departments (excluding grocery column)
ggplot(top_departments, aes(x = reorder(department, n), y = n, fill = department)) +
geom_bar(stat = "identity") +
labs(title = "Product Departments Purchased Age 45-54",
x = "Product Department",
y = "Purchase Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Using this chart, we can see that drug GM, produce, and packaged meat are the top 3 most purchased departments. Meat, deli, and pastry are also common departments for purchases in the 45-54 age range, but a steep drop off suggests that efforts can be made into marketing and creating discounts for these departments.
Next, I am going to evaluate those with relatively higher income ranges, specifically 125-149K, and 150-174K earners. Understanding this demographic is very important to unlocking more profits since they have the finances to afford quality groceries and are usually willing to pay a premium for the products they prefer.
# Group by income and product_id to calculate total revenue
revenue_by_income <- merged_data %>%
group_by(income, product_id) %>%
summarise(total_revenue = sum(revenue), .groups = "drop")
# Clean the data by dropping rows with NA values in income or total_revenue
revenue_by_income_clean <- revenue_by_income %>%
drop_na(total_revenue, income)
# Plot total revenue by income ranges
ggplot(revenue_by_income_clean, aes(x = income, y = total_revenue)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Total Revenue by Income Group", x = "Income Group", y = "Total Revenue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Rotate x-axis labels
scale_y_continuous(labels = label_number()) # Remove scientific notation
# Join transactions with demographics to include income group information
transactions_with_demographics <- transactions %>%
left_join(demographics, by = "household_id")
# Join with products data to get department information
transactions_with_departments <- transactions_with_demographics %>%
left_join(products, by = "product_id")
# Count purchases by department and income
department_purchase_counts_income <- transactions_with_departments %>%
count(department, income) %>%
arrange(desc(n)) # Sort by the number of purchases (n)
# Filter for the income ranges: 125-149K, and 150-174K
filtered_income_groups <- department_purchase_counts_income %>%
filter(income %in% c("125-149K", "150-174K"))
# Filter out grocery column and get the top 7 departments for each income range
top_departments_income <- filtered_income_groups %>%
filter(department != "GROCERY") %>%
group_by(income) %>%
slice_max(order_by = n, n = 7) %>% # Select top 7 based on purchase count (n)
arrange(desc(n))
# Plot the top 7 departments purchased for each income group with small multiples
ggplot(top_departments_income, aes(x = reorder(department, n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
facet_wrap(~ income) + # Small multiples for income groups
labs(title = "Top 10 Most Purchased Departments by Income Group",
x = "Product Department",
y = "Purchase Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
As shown in this graph, the higher income levels purchase produce much more often compared to the average consumer in the 45-54 age range. This is potentially due to this bracket having more money to spend on quality foods such as produce. Understanding this trend that as income levels rise, the consumers are purchasing fresh produce more frequently. My solution is for the store to increase the price on produce, since their most desired customers for this department have a higher income and would be willing to spend more on a high quality product.
In Conclusion, my target groups for the store to consider adapting their efforts for are the age group of 45-54 and the income ranges from 125K-175K. Understanding the purchasing trends of these segments has led me to suggest that the store creates slight discounts on meat, deli, and pastry departments to encourage the 45-54 year old customers to purchase more frequently. Also, for the higher income customers, increasing the price on produce is beneficial since it is the most popular department in terms of purchase frequency.