library(ggplot2)
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(RColorBrewer)
library(viridisLite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
library(knitr)
library(rmarkdown)
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
transactions <- transactions_sample
summary(transactions_sample$transaction_timestamp)
## Min. 1st Qu.
## "2017-01-01 07:30:27.0000" "2017-04-01 13:23:49.0000"
## Median Mean
## "2017-07-02 11:43:04.0000" "2017-07-02 09:39:57.5612"
## 3rd Qu. Max.
## "2017-10-02 16:28:50.2500" "2017-12-31 22:47:38.0000"
summary(transactions_sample$day_of_week)
## Warning: Unknown or uninitialised column: `day_of_week`.
## Length Class Mode
## 0 NULL NULL
summary(demographics$income)
## Under 15K 15-24K 25-34K 35-49K 50-74K 75-99K 100-124K 125-149K
## 61 74 77 172 192 96 34 38
## 150-174K 175-199K 200-249K 250K+
## 30 11 5 11
sales_data <- demographics %>%
filter(kids_count >= 2) %>%
inner_join(transactions, by = "household_id") %>%
group_by(kids_count) %>%
summarize(
total_sales = sum(sales_value, na.rm = TRUE),
avg_sales_per_household = mean(sales_value, na.rm = TRUE),
total_transactions = n(),
num_households = n_distinct(household_id),
.groups = "drop")
ggplot(sales_data, aes(x = factor(kids_count), y = total_sales, fill = kids_count)) +
geom_col(color = "black", show.legend = FALSE) +
geom_text(aes(label = scales::dollar(total_sales)), vjust = -0.5, size = 4) +
scale_y_continuous(labels = dollar) +
labs(
title = "Total Sales by Household Size (>= 2)",
subtitle = "2017 Grocery Data From Complete Journey",
x = "Number of Kids in Household",
y = "Total Sales ($)" ) +
theme_minimal() +
theme(axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12, face = "italic")) +
scale_fill_viridis_d()

sales_data %>%
arrange(desc(total_sales)) %>%
select(kids_count, num_households, total_sales, avg_sales_per_household, total_transactions)
## # A tibble: 2 × 5
## kids_count num_households total_sales avg_sales_per_household
## <ord> <int> <dbl> <dbl>
## 1 3+ 69 12738. 3.16
## 2 2 60 12196. 3.01
## # ℹ 1 more variable: total_transactions <int>
sales_data <- transactions %>%
inner_join(products, by = "product_id") %>%
inner_join(demographics, by = "household_id") %>%
mutate(marital_status = ifelse(is.na(marital_status), "Unknown", marital_status)) %>% # Handle NA values
group_by(brand, marital_status) %>%
summarize(
total_quantity = sum(quantity, na.rm = TRUE),
avg_quantity_per_purchase = mean(quantity, na.rm = TRUE),
total_sales_value = sum(sales_value, na.rm = TRUE),
num_transactions = n(),
.groups = "drop" )
ggplot(sales_data, aes(x = marital_status, y = total_quantity, color = brand)) +
geom_point(aes(size = total_sales_value), alpha = 0.7) + # Size represents total sales value
geom_text(aes(label = scales::comma(total_quantity)), vjust = -0.5, size = 3.5, check_overlap = TRUE) +
facet_wrap(~ brand, scales = "free_y") + # Separate plots for each brand
scale_y_log10(labels = scales::comma) +
scale_color_manual(values = c("red", "blue", "green", "purple", "orange", "brown")) + # Customize colors
labs(
title = "Total Quantity Purchased vs. Marital Status by Brand",
subtitle = "Analysis of Product Sales Across Household Marital Status",
x = "Marital Status",
y = "Total Quantity Purchased (Log Scale)",
size = "Total Sales ($)",
color = "Brand") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12, face = "italic"),
legend.position = "right")

sales_data %>%
arrange(desc(total_quantity)) %>%
select(brand, marital_status, total_quantity, avg_quantity_per_purchase, total_sales_value, num_transactions)
## # A tibble: 6 × 6
## brand marital_status total_quantity avg_quantity_per_pur…¹ total_sales_value
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 Private 1 2547669 500. 17398.
## 2 Private 2 1662707 356. 14201.
## 3 Private Unknown 708589 326. 6478.
## 4 Nation… 1 17992 1.30 44513.
## 5 Nation… 2 14606 1.30 35002.
## 6 Nation… Unknown 6822 1.35 16022.
## # ℹ abbreviated name: ¹avg_quantity_per_purchase
## # ℹ 1 more variable: num_transactions <int>
data_merged <- transactions_sample %>%
left_join(products, by = "product_id") %>%
left_join(demographics, by = "household_id")
income_levels <- c("Under 15K", "15-24K", "25-34K", "35-49K", "50-74K", "75-99K", "100-124K", "125-149K", "150-174K", "175-199K")
category_sales <- data_merged %>%
filter(income %in% income_levels) %>% # Filter income levels
group_by(product_category, income) %>%
summarise(total_sales_value = sum(sales_value, na.rm = TRUE), .groups = 'drop')
top_categories <- category_sales %>%
group_by(product_category) %>%
summarise(total_sales_value = sum(total_sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales_value)) %>%
slice_head(n = 5) %>% # Select top 5 categories
pull(product_category)
filtered_data <- category_sales %>%
filter(product_category %in% top_categories)
ggplot(filtered_data, aes(x = reorder(product_category, -total_sales_value),
y = total_sales_value, fill = income)) +
geom_col(position = position_dodge(width = 0.8)) +
labs(title = "Top 5 Product Categories vs Household Income Levels (< 199K)",
subtitle = "Total sales value for top product categories across income levels",
x = "Product Category",
y = "Total Sales Value",
fill = "Income Level") +
scale_y_continuous(limits = c(0, max(filtered_data$total_sales_value) * 1.1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_brewer(palette = "Paired")
