library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(dplyr)

# Work from the sample of transactions that ships with completejourney.
transactions <- completejourney::transactions_sample
# Show the product lookup table that the transactions are joined to below.
completejourney::products
## # A tibble: 92,331 × 7
##    product_id manufacturer_id department    brand  product_category product_type
##    <chr>      <chr>           <chr>         <fct>  <chr>            <chr>       
##  1 25671      2               GROCERY       Natio… FRZN ICE         ICE - CRUSH…
##  2 26081      2               MISCELLANEOUS Natio… <NA>             <NA>        
##  3 26093      69              PASTRY        Priva… BREAD            BREAD:ITALI…
##  4 26190      69              GROCERY       Priva… FRUIT - SHELF S… APPLE SAUCE 
##  5 26355      69              GROCERY       Priva… COOKIES/CONES    SPECIALTY C…
##  6 26426      69              GROCERY       Priva… SPICES & EXTRAC… SPICES & SE…
##  7 26540      69              GROCERY       Priva… COOKIES/CONES    TRAY PACK/C…
##  8 26601      69              DRUG GM       Priva… VITAMINS         VITAMIN - M…
##  9 26636      69              PASTRY        Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691      16              GROCERY       Priva… PNT BTR/JELLY/J… HONEY       
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>

Plot 1: Bar graph showing the combined total purchases of the top 5 product categories for each age group

# Number of transaction rows for every (age group, product category) pair,
# ordered from the most- to the least-purchased combination.
plot1 <- transactions %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  count(age, product_category, name = "total_purchase") %>%
  arrange(desc(total_purchase))

# Within each age group keep the five categories with the highest purchase
# count. The result remains grouped by age.
top_categories <- slice_max(
  group_by(plot1, age),
  order_by = total_purchase,
  n = 5
)
top_categories
## # A tibble: 30 × 3
## # Groups:   age [6]
##    age   product_category       total_purchase
##    <ord> <chr>                           <int>
##  1 19-24 SOFT DRINKS                       134
##  2 19-24 FROZEN PIZZA                       86
##  3 19-24 BAKED BREAD/BUNS/ROLLS             74
##  4 19-24 FRZN MEAT/MEAT DINNERS             67
##  5 19-24 CHEESE                             65
##  6 25-34 SOFT DRINKS                       278
##  7 25-34 CHEESE                            241
##  8 25-34 FLUID MILK PRODUCTS               229
##  9 25-34 BAKED BREAD/BUNS/ROLLS            220
## 10 25-34 BAG SNACKS                        189
## # ℹ 20 more rows
# Bar chart with one bar per age group. geom_col() stacks the rows of each
# group, so every bar is the combined purchase count of that age group's top
# five product categories.
ggplot(top_categories, aes(x = age, y = total_purchase)) +
  geom_col(fill = "blue") +
  scale_y_continuous(
    # The ranked items are product categories, not brands.
    "Total Number of Purchases of the Top 5 Product Categories",
    labels = scales::number
  ) +
  scale_x_discrete("Age Group") +
  ggtitle(
    "Total Purchases of Top 5 Product Categories",
    subtitle = "By Each Age Group"
  )

Plot 2: Line graph showing how many bag snacks and soft drinks are bought, by the number of kids in a household

# Total quantity of soft drinks and bag snacks bought, broken down by the
# number of kids in the purchasing household.
plot2 <- transactions %>%
  inner_join(
    # Restrict the product table to the two categories of interest before
    # joining, so only matching transactions survive the inner join.
    filter(products, product_category %in% c("SOFT DRINKS", "BAG SNACKS")),
    by = "product_id"
  ) %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(kids_count, product_category) %>%
  summarise(total_sales = sum(quantity), .groups = "drop")
plot2
## # A tibble: 8 × 3
##   kids_count product_category total_sales
##   <ord>      <chr>                  <dbl>
## 1 0          BAG SNACKS               722
## 2 0          SOFT DRINKS             1459
## 3 1          BAG SNACKS               236
## 4 1          SOFT DRINKS              524
## 5 2          BAG SNACKS               148
## 6 2          SOFT DRINKS              205
## 7 3+         BAG SNACKS               194
## 8 3+         SOFT DRINKS              195
# Line chart with one line per product category across the kids-count levels.
# `group = product_category` is required because the x axis is discrete;
# without it ggplot would not connect the points into lines.
ggplot(
  plot2,
  aes(
    x = kids_count,
    y = total_sales,
    color = product_category,
    group = product_category
  )
) +
  # `linewidth` replaces the `size` aesthetic for lines (ggplot2 >= 3.4.0).
  geom_line(linewidth = 1) +
  geom_point(color = "black", shape = 8) +
  scale_color_manual(values = c("SOFT DRINKS" = "pink", "BAG SNACKS" = "red")) +
  # total_sales is the summed `quantity`, i.e. units bought, not a count of
  # purchase events.
  scale_y_continuous("Total Quantity Purchased", labels = scales::number) +
  scale_x_discrete("Number of Kids in The Household") +
  ggtitle(
    "Total Sales Of Soft Drinks And Bag Snacks",
    subtitle = "By The Number Of Kids In A Household"
  )

Plot 3: Scatterplots, one per quarter, showing the relationship between store_id and the quantity bought during a single grocery trip

# Number of transaction rows per quarter, store and household income band,
# restricted to stores whose id starts with "2".
plot3 <- transactions %>%
  inner_join(demographics, by = "household_id") %>%
  # Map the week number onto four 13-week quarters. Weeks outside 1-52 match
  # no branch, get NA, and are removed by the filter that follows.
  mutate(quarter = case_when(
    between(week, 1, 13) ~ "Q1",
    between(week, 14, 26) ~ "Q2",
    between(week, 27, 39) ~ "Q3",
    between(week, 40, 52) ~ "Q4"
  )) %>%
  filter(!is.na(quarter)) %>%
  group_by(quarter, store_id, income) %>%
  # `quantity` here is a row count (n()), not the sum of the `quantity` column.
  summarise(quantity = n(), .groups = "drop") %>%
  # store_id is a character column, so comparing it with the numbers 200 and
  # 299 compares strings lexicographically rather than testing a numeric
  # range. Select ids beginning with "2" explicitly instead.
  filter(str_starts(store_id, "2"))
plot3
## # A tibble: 233 × 4
##    quarter store_id income    quantity
##    <chr>   <chr>    <ord>        <int>
##  1 Q1      224      125-149K         1
##  2 Q1      2538     50-74K           3
##  3 Q1      27       35-49K           2
##  4 Q1      276      50-74K           1
##  5 Q1      2816     35-49K           2
##  6 Q1      286      Under 15K        7
##  7 Q1      286      35-49K          11
##  8 Q1      286      125-149K        16
##  9 Q1      2875     150-174K         2
## 10 Q1      288      Under 15K        9
## # ℹ 223 more rows
# Scatterplot of the per-group count against store id, one panel per quarter,
# with points coloured by household income band.
ggplot(plot3, aes(x = quantity, y = store_id, color = income)) +
  geom_point(size = 3) +
  facet_wrap(vars(quarter)) +
  scale_color_viridis_d(option = "plasma") +
  scale_x_continuous(name = "Quantity Bought In Each Trip") +
  scale_y_discrete(name = "Store Id") +
  labs(
    title = "Relationship Between Store Id And Quantity Bought During A Grocery Trip",
    subtitle = "When Store Ids Starts With A 2"
  )