library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(dplyr)
# Alias `transactions_sample` under the shorter name `transactions`, which the
# pipelines further down refer to.
transactions <- transactions_sample
# Print the `products` table to inspect its columns before joining on it.
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
Plot 1: Bar graph showing the total purchases of the top 5 product
categories, combined, for each age group
# Number of transaction rows per (age group, product category) pair, with the
# most frequently purchased pairs listed first.
plot1 <- transactions %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  count(age, product_category, name = "total_purchase", sort = TRUE)

# For every age group keep the categories with the five highest counts.
# The result stays grouped by `age`.
top_categories <- plot1 %>%
  group_by(age) %>%
  slice_max(total_purchase, n = 5)
top_categories
## # A tibble: 30 × 3
## # Groups: age [6]
## age product_category total_purchase
## <ord> <chr> <int>
## 1 19-24 SOFT DRINKS 134
## 2 19-24 FROZEN PIZZA 86
## 3 19-24 BAKED BREAD/BUNS/ROLLS 74
## 4 19-24 FRZN MEAT/MEAT DINNERS 67
## 5 19-24 CHEESE 65
## 6 25-34 SOFT DRINKS 278
## 7 25-34 CHEESE 241
## 8 25-34 FLUID MILK PRODUCTS 229
## 9 25-34 BAKED BREAD/BUNS/ROLLS 220
## 10 25-34 BAG SNACKS 189
## # ℹ 20 more rows
# Bar chart of purchase counts by age group. `geom_col()` stacks the rows of
# each age group, so a bar's height is the combined count of that group's top
# product categories.
ggplot(top_categories, aes(x = age, y = total_purchase)) +
  geom_col(fill = "blue") +
  # The plotted data are product categories, not brands, so the axis label now
  # agrees with the chart title.
  scale_y_continuous(
    "Total Number of Purchases of the Top 5 Product Categories",
    labels = scales::number
  ) +
  scale_x_discrete("Age Group") +
  ggtitle(
    "Total Purchases of Top 5 Product Categories",
    subtitle = "By Each Age Group"
  )

Plot 2: Line graph showing how many bag snacks and soft drinks are bought
depending on the number of kids in a household
# Summed `quantity` of soft drinks and bag snacks for each combination of
# household kids count and product category.
plot2 <- transactions %>%
  inner_join(
    filter(products, product_category %in% c("BAG SNACKS", "SOFT DRINKS")),
    by = "product_id"
  ) %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(kids_count, product_category) %>%
  summarise(total_sales = sum(quantity), .groups = "drop")
plot2
## # A tibble: 8 × 3
## kids_count product_category total_sales
## <ord> <chr> <dbl>
## 1 0 BAG SNACKS 722
## 2 0 SOFT DRINKS 1459
## 3 1 BAG SNACKS 236
## 4 1 SOFT DRINKS 524
## 5 2 BAG SNACKS 148
## 6 2 SOFT DRINKS 205
## 7 3+ BAG SNACKS 194
## 8 3+ SOFT DRINKS 195
# Line chart of total quantity sold against the number of kids in the
# household, one line per product category.
ggplot(
  plot2,
  aes(
    x = kids_count,
    y = total_sales,
    color = product_category,
    # The x axis is discrete, so an explicit group is needed for the points of
    # each category to be connected into a line.
    group = product_category
  )
) +
  # `linewidth` replaces the `size` aesthetic for lines, which was deprecated
  # in ggplot2 3.4.0 and triggered a lifecycle warning.
  geom_line(linewidth = 1) +
  geom_point(color = "black", shape = 8) +
  scale_color_manual(values = c("SOFT DRINKS" = "pink", "BAG SNACKS" = "red")) +
  scale_y_continuous("Total Number of Purchases", labels = scales::number) +
  scale_x_discrete("Number of Kids in The Household") +
  ggtitle(
    "Total Sales Of Soft Drinks And Bag Snacks",
    subtitle = "By The Number Of Kids In A Household"
  )

Scatterplots for each quarter to show the relationship between
store_id and quantity bought during a single grocery trip
# Count transaction rows per quarter, store and household income bracket,
# restricted to stores whose id begins with "2".
plot3 <- transactions %>%
  # `store_id` is a character column, so the previous range test
  # (`store_id >= 200 & store_id <= 299.00`) compared strings
  # lexicographically and let ids such as "27" and "2538" through. Match the
  # leading digit explicitly, as the plot subtitle describes. Filtering before
  # the join also avoids joining and aggregating rows that are discarded.
  filter(str_starts(store_id, "2")) %>%
  inner_join(demographics, by = "household_id") %>%
  mutate(quarter = case_when(
    between(week, 1, 13) ~ "Q1",
    between(week, 14, 26) ~ "Q2",
    between(week, 27, 39) ~ "Q3",
    between(week, 40, 52) ~ "Q4"
  )) %>%
  # Weeks outside 1-52 match no branch above and come back as NA; drop them.
  filter(!is.na(quarter)) %>%
  group_by(quarter, store_id, income) %>%
  # NOTE(review): n() counts transaction rows, not the summed `quantity`
  # column of `transactions` -- confirm this is the intended measure.
  summarise(quantity = n(), .groups = "drop")
plot3
## # A tibble: 233 × 4
## quarter store_id income quantity
## <chr> <chr> <ord> <int>
## 1 Q1 224 125-149K 1
## 2 Q1 2538 50-74K 3
## 3 Q1 27 35-49K 2
## 4 Q1 276 50-74K 1
## 5 Q1 2816 35-49K 2
## 6 Q1 286 Under 15K 7
## 7 Q1 286 35-49K 11
## 8 Q1 286 125-149K 16
## 9 Q1 2875 150-174K 2
## 10 Q1 288 Under 15K 9
## # ℹ 223 more rows
# One scatter panel per quarter: row count on the x axis, store id on the
# y axis, points coloured by household income bracket.
ggplot(plot3, aes(x = quantity, y = store_id)) +
  geom_point(aes(color = income), size = 3) +
  facet_wrap(~ quarter) +
  scale_x_continuous("Quantity Bought In Each Trip") +
  scale_y_discrete("Store Id") +
  scale_color_viridis_d(option = "plasma") +
  labs(
    title = "Relationship Between Store Id And Quantity Bought During A Grocery Trip",
    subtitle = "When Store Ids Starts With A 2"
  )
