# ==================== Data Preparation ====================
# -- Packages Used --
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(forcats)
library(ggrepel)
# Global plot theme: a minimal base with a bold title, smaller subtitle and
# caption text, the legend above the panel, and no minor grid lines.
theme_set(
  theme_minimal(base_size = 12) +
    theme(
      legend.position = "top",
      panel.grid.minor = element_blank(),
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 11),
      plot.caption = element_text(size = 9, color = "grey40")
    )
)
# -- Main dataset used from Complete Journey --
# Each sampled transaction is matched to its product record and to the
# purchasing household's demographics. Both joins are inner joins, so
# transactions without a match in either table are dropped.
cj <- transactions_sample %>%
  inner_join(products, by = join_by(product_id)) %>%
  inner_join(demographics, by = join_by(household_id))
# -- Data Set #1 --
# Weekly total sales, split by whether the household's kids_count is "0".
kids_week <- cj %>%
  mutate(
    group = if_else(
      kids_count == "0",
      "Families without Kids",
      "Families with Kids"
    )
  ) %>%
  group_by(week, group) %>%
  summarise(total_sales = sum(sales_value), .groups = "drop")
# -- Plotting Data Set #1 --
# Weekly sales per household group: the raw series is drawn as a line with a
# LOESS trend on top. Two shaded bands mark weeks 32-36 and weeks 47-52.
ggplot(kids_week, aes(week, total_sales, color = group)) +
  # `linewidth` replaces the `size` aesthetic for lines, which was deprecated
  # in ggplot2 3.4.0 and triggered a lifecycle warning.
  geom_line(linewidth = 0.9, alpha = 0.85) +
  # The formula is spelled out (it is the default) so geom_smooth() does not
  # emit its "using formula = 'y ~ x'" message.
  geom_smooth(
    se = FALSE, method = "loess", formula = y ~ x,
    span = 0.25, linewidth = 1
  ) +
  # Infinite y-limits make the bands span the full panel height.
  annotate(
    "rect",
    xmin = 32, xmax = 36, ymin = -Inf, ymax = Inf, alpha = .06
  ) +
  annotate(
    "rect",
    xmin = 47, xmax = 52, ymin = -Inf, ymax = Inf, alpha = .06
  ) +
  # Band labels sit just below the tallest observed weekly total.
  annotate(
    "text",
    x = 34, y = max(kids_week$total_sales) * .98,
    label = "Back-to-School Season", vjust = 1, size = 3.3
  ) +
  annotate(
    "text",
    x = 49.5, y = max(kids_week$total_sales) * .98,
    label = "Holiday Season", vjust = 1, size = 3.3
  ) +
  scale_y_continuous(labels = dollar) +
  scale_x_continuous(breaks = seq(1, 53, 4)) +
  labs(
    title = "Comparing Weekly Household Spending for Families with and without Kids",
    subtitle = paste0(
      "Data represents the total weekly sales across all baskets during the year 2017.\n",
      "The shaded bands highlight back-to-school and holiday periods."
    ),
    x = "Week of Year", y = "Total Sales ($)",
    color = NULL,
    caption = "Source: completejourney"
  )
# Plot #2
# -- Data Set #2 --
# Total sales per product type within each household group. Starts from the
# already-joined `cj` table instead of repeating the transactions/products/
# demographics joins.
cj_prod_family <- cj %>%
  mutate(
    family_type = if_else(
      kids_count == "0",
      "Families without Kids",
      "Families with Kids"
    )
  ) %>%
  group_by(family_type, product_type) %>%
  summarise(total_sales = sum(sales_value), .groups = "drop")
# -- Plotting Data Set #2 --
#' Horizontal bar chart of the top-selling product types for one group
#'
#' Filters `df` to a single `family_type`, keeps the `n` rows with the
#' largest `total_sales`, and draws them as labelled horizontal bars ordered
#' by sales.
#'
#' @param df Data frame with columns `family_type`, `product_type`, and
#'   `total_sales`.
#' @param family_label Value of `family_type` to keep; also appended to the
#'   plot title.
#' @param fill_color Fill colour for the bars.
#' @param n Number of product types to show. Defaults to 5.
#' @return A ggplot object.
make_top5_plot <- function(df, family_label, fill_color, n = 5) {
  df %>%
    filter(family_type == family_label) %>%
    # with_ties = FALSE caps the result at `n` rows even when totals tie.
    slice_max(total_sales, n = n, with_ties = FALSE) %>%
    # Reorder the factor so bars are drawn in order of sales.
    mutate(product_type = fct_reorder(product_type, total_sales)) %>%
    ggplot(aes(x = total_sales, y = product_type)) +
    geom_col(fill = fill_color, width = 0.7) +
    geom_text(
      aes(label = dollar(round(total_sales, 0))),
      hjust = -0.05, size = 3.2
    ) +
    # Extra space on the right keeps the value labels inside the panel.
    scale_x_continuous(
      labels = dollar,
      expand = expansion(mult = c(0, 0.18))
    ) +
    labs(
      title = paste0("Top ", n, " Products by Total Sales: ", family_label),
      subtitle = "Total sales for product per group ranked during the year 2017",
      x = "Total Sales ($)", y = NULL,
      caption = "Source: completejourney"
    ) +
    theme_minimal(base_size = 12) +
    theme(panel.grid.minor = element_blank())
}
# Build one chart per household group, then show each in turn.
p_with_kids <- make_top5_plot(
  df = cj_prod_family,
  family_label = "Families with Kids",
  fill_color = "#377eb8"
)
p_no_kids <- make_top5_plot(
  df = cj_prod_family,
  family_label = "Families without Kids",
  fill_color = "#e41a1c"
)
p_with_kids
p_no_kids
# -- Data Set #3 --
# Total sales per department within each household group. Starts from the
# already-joined `cj` table instead of repeating the joins.
dept_sales <- cj %>%
  mutate(family_type = if_else(kids_count == "0", "No Kids", "With Kids")) %>%
  group_by(family_type, department) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop")

# Ten departments with the highest sales summed over both household groups.
# with_ties = FALSE keeps this at 10 rows at most; the default
# (with_ties = TRUE) would return extra departments on a tie at the cutoff.
top_depts <- dept_sales %>%
  group_by(department) %>%
  summarise(overall_sales = sum(total_sales), .groups = "drop") %>%
  slice_max(order_by = overall_sales, n = 10, with_ties = FALSE)

# Per-group rows restricted to those top departments.
dept_top <- dept_sales %>%
  filter(department %in% top_depts$department)
# -- Plotting Data Set #3 --
# Dodged bars per household type for each department, flipped to run
# horizontally. Departments are ordered by the mean of their per-group
# totals (mean is reorder()'s default summary, spelled out here).
ggplot(
  dept_top,
  aes(
    x = reorder(department, total_sales, FUN = mean),
    y = total_sales,
    fill = family_type
  )
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c("No Kids" = "#e41a1c", "With Kids" = "#377eb8")
  ) +
  scale_y_continuous(labels = dollar) +
  coord_flip() +
  labs(
    title = "Where Do Families Spend the Most? With vs. Without Kids",
    subtitle = "Top 10 departments ranked by total 2017 sales value",
    x = NULL,
    y = "Total Sales ($)",
    fill = "Household type",
    caption = "Source: completejourney"
  ) +
  # This plot uses its own theme in place of the global one set at the top.
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold"),
    legend.position = "top"
  )