# ==================== Data Preparation ====================
# -- Packages Used --
library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney) 
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(forcats) 
library(ggrepel)

# Session-wide default theme: start from theme_minimal() at 12 pt, then layer
# the title/subtitle/caption styling, legend placement and grid tweak on top.
theme_set(theme_minimal(base_size = 12))
theme_update(
  plot.title       = element_text(face = "bold", size = 16),
  plot.subtitle    = element_text(size = 11),
  plot.caption     = element_text(color = "grey40", size = 9),
  legend.position  = "top",
  panel.grid.minor = element_blank()
)

# -- Main dataset used from Complete Journey --
# Sampled transactions enriched with product attributes (matched on
# product_id) and household demographics (matched on household_id). Both are
# inner joins, so rows without a match on either side are dropped.
cj <- inner_join(
  inner_join(transactions_sample, products, by = "product_id"),
  demographics,
  by = "household_id"
)

# Plot #1

# -- Data Set #1 --
# Total sales per week for each household group. A household belongs to
# "Families with Kids" when its kids_count is anything other than "0".
kids_week <- cj %>%
  mutate(
    group = if_else(
      kids_count != "0",
      "Families with Kids",
      "Families without Kids"
    )
  ) %>%
  group_by(week, group) %>%
  summarise(
    total_sales = sum(sales_value),
    .groups = "drop"
  )

# -- Plotting Data Set #1 --
# Weekly sales per household group: the raw series plus a LOESS trend line.
# Two shaded bands mark weeks 32-36 (back-to-school) and weeks 47-52 (holiday);
# their labels sit just below the highest weekly total in the data.
ggplot(kids_week, aes(week, total_sales, color = group)) +
  # `linewidth` replaces `size` for line geoms (`size` deprecated in
  # ggplot2 3.4.0).
  geom_line(linewidth = 0.9, alpha = 0.85) +
  # Stating the formula explicitly avoids the "using formula = 'y ~ x'" message.
  geom_smooth(
    se = FALSE, method = "loess", formula = y ~ x,
    span = 0.25, linewidth = 1
  ) +
  annotate("rect", xmin = 32, xmax = 36, ymin = -Inf, ymax = Inf, alpha = .06) +
  annotate("rect", xmin = 47, xmax = 52, ymin = -Inf, ymax = Inf, alpha = .06) +
  annotate(
    "text",
    x = 34, y = max(kids_week$total_sales) * .98,
    label = "Back-to-School Season", vjust = 1, size = 3.3
  ) +
  annotate(
    "text",
    x = 49.5, y = max(kids_week$total_sales) * .98,
    label = "Holiday Season", vjust = 1, size = 3.3
  ) +
  scale_y_continuous(labels = dollar) +
  scale_x_continuous(breaks = seq(1, 53, 4)) +
  labs(
    title = "Comparing Weekly Household Spending for Families with and without Kids",
    subtitle = "Data represents the total weekly sales across all baskets during the year 2017.
The shaded bands highlight back-to-school and holiday periods.",
    x = "Week of Year", y = "Total Sales ($)",
    color = NULL,
    caption = "Source: completejourney"
  )

# Plot #2

# -- Data Set #2 --
# Total sales per product type within each family group. Starts from `cj`,
# which already holds transactions joined to products and demographics, rather
# than repeating the same two joins.
cj_prod_family <- cj %>%
  mutate(
    family_type = if_else(
      kids_count == "0",
      "Families without Kids",
      "Families with Kids"
    )
  ) %>%
  group_by(family_type, product_type) %>%
  summarise(total_sales = sum(sales_value), .groups = "drop")

# -- Plotting Data Set #2 --
#' Horizontal bar chart of the best-selling product types for one family group
#'
#' Keeps the rows of `df` whose `family_type` equals `family_label`, takes the
#' `n` rows with the largest `total_sales` (ties broken arbitrarily so exactly
#' `n` rows are kept), and draws them as bars ordered by sales with a dollar
#' label at the end of each bar.
#'
#' @param df Data frame with columns `family_type`, `product_type` and
#'   `total_sales`.
#' @param family_label Value of `family_type` to plot; also appended to the
#'   plot title.
#' @param fill_color Fill colour used for every bar.
#' @param n Number of product types to show. Defaults to 5.
#' @return A ggplot object.
make_top5_plot <- function(df, family_label, fill_color, n = 5) {
  top_products <- df %>%
    filter(family_type == family_label) %>%
    slice_max(total_sales, n = n, with_ties = FALSE)

  # A label that matches nothing (e.g. a typo) would otherwise yield a blank
  # chart with no hint as to why.
  if (nrow(top_products) == 0) {
    stop(
      "No rows in `df` have family_type == \"", family_label, "\".",
      call. = FALSE
    )
  }

  top_products %>%
    mutate(product_type = fct_reorder(product_type, total_sales)) %>%
    ggplot(aes(x = total_sales, y = product_type)) +
    geom_col(fill = fill_color, width = 0.7) +
    # Labels sit just past the bar ends; the extra right-hand expansion on the
    # x scale below leaves room for them.
    geom_text(aes(label = dollar(round(total_sales, 0))),
              hjust = -0.05, size = 3.2) +
    scale_x_continuous(labels = dollar, expand = expansion(mult = c(0, 0.18))) +
    labs(
      title = paste0("Top ", n, " Products by Total Sales: ", family_label),
      subtitle = "Total sales for product per group ranked during the year 2017",
      x = "Total Sales ($)", y = NULL,
      caption = "Source: completejourney"
    ) +
    # NOTE: adding a complete theme here replaces whatever theme_set()
    # installed, so only the settings below apply to this plot.
    theme_minimal(base_size = 12) +
    theme(panel.grid.minor = element_blank())
}

# Build one top-5 chart per family group, then print each in turn.
p_with_kids <- make_top5_plot(
  df = cj_prod_family,
  family_label = "Families with Kids",
  fill_color = "#377eb8"
)
p_no_kids <- make_top5_plot(
  df = cj_prod_family,
  family_label = "Families without Kids",
  fill_color = "#e41a1c"
)

p_with_kids

p_no_kids

# Plot #3

# -- Data Set #3 --
# Total sales per department within each family group. Starts from `cj`, which
# already holds transactions joined to products and demographics, rather than
# repeating the same two joins.
dept_sales <- cj %>%
  mutate(family_type = if_else(kids_count == "0", "No Kids", "With Kids")) %>%
  group_by(family_type, department) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop")

# The ten departments with the highest sales across both family groups.
# with_ties = FALSE caps the result at exactly ten rows even if totals tie.
top_depts <- dept_sales %>%
  group_by(department) %>%
  summarise(overall_sales = sum(total_sales)) %>%
  slice_max(order_by = overall_sales, n = 10, with_ties = FALSE)

# Per-group sales restricted to those top departments.
dept_top <- dept_sales %>%
  filter(department %in% top_depts$department)

# -- Plotting Data Set #3 --
# Dodged bars comparing the two family groups within each top department.
# Departments are ordered by their summed sales across both groups, matching
# the "ranked by total" subtitle; reorder() would otherwise order by the mean
# of the per-group rows.
ggplot(
  dept_top,
  aes(
    x = reorder(department, total_sales, FUN = sum),
    y = total_sales,
    fill = family_type
  )
) +
  geom_col(position = "dodge") +
  # Flip so department names read horizontally on the vertical axis.
  coord_flip() +
  scale_y_continuous(labels = dollar) +
  scale_fill_manual(values = c("No Kids" = "#e41a1c", "With Kids" = "#377eb8")) +
  labs(
    title = "Where Do Families Spend the Most? With vs. Without Kids",
    subtitle = "Top 10 departments ranked by total 2017 sales value",
    x = NULL, y = "Total Sales ($)",
    fill = "Household type",
    caption = "Source: completejourney"
  ) +
  # NOTE: adding a complete theme here replaces whatever theme_set()
  # installed, so the legend position and bold title are restated below.
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "top",
    plot.title = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold")
  )