library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ✔ purrr   0.3.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(stringr)
library(tidyr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load the transaction and promotion tables through the completejourney
# helper functions; the plots below read `transactions` from here.
transactions <- completejourney::get_transactions()
promotions <- completejourney::get_promotions()
  # Total discount (retail + coupon + coupon-match) per age group, one bar
  # per group.
  demographics %>%
    inner_join(transactions, by = "household_id") %>%
    # No product column is used below; the inner join is kept only so that the
    # same set of transactions (those with a matching product) is counted.
    inner_join(products, by = "product_id") %>%
    group_by(age) %>%
    summarize(
      # sum() collapses each age group to a single row. Without it summarize()
      # returns one row per transaction and each bar is drawn as a stack of
      # per-transaction slivers.
      total_discount = sum(retail_disc + coupon_disc + coupon_match_disc),
      .groups = "drop"
    ) %>%
    ggplot(aes(x = age, y = total_discount, fill = age)) +
    geom_col() +
    scale_y_continuous(name = "Total Discount", labels = scales::dollar) +
    scale_x_discrete(name = "Age Group") +
    labs(title = "Total Discount by Age Group")


  # Coupon discount summed per age group, with each bar split (stacked) by
  # household composition. Note that only `coupon_disc` is summed here and the
  # source table is `transactions_sample`, not the full `transactions`.
  transactions_sample %>%
    inner_join(demographics, by = "household_id") %>%
    group_by(age, household_comp) %>%
    # Drop the grouping once the totals are computed so nothing downstream
    # inherits a leftover `age` grouping.
    summarize(total_discount = sum(coupon_disc), .groups = "drop") %>%
    ggplot(aes(x = age, y = total_discount, fill = household_comp)) +
    geom_col() +
    guides(fill = guide_legend(title = "Household Composition")) +
    labs(
      title = "Total Discount by each Age Group classified by Household composition",
      x = "Age Group",
      y = "Total Discount"
    )


# Total discount per age group, one panel per department.
transactions %>%
  inner_join(demographics, by = "household_id") %>%
  inner_join(products, by = "product_id") %>%
  mutate(total_discount = retail_disc + coupon_disc + coupon_match_disc) %>%
  filter(total_discount > 0) %>%
  # Aggregate first so every bar is a single value, not a stack of one
  # rectangle per transaction.
  group_by(department, age) %>%
  summarize(total_discount = sum(total_discount), .groups = "drop") %>%
  ggplot(aes(x = age, y = total_discount, fill = age)) +
  geom_col() +
  facet_wrap(~department) +
  scale_y_continuous(breaks = seq(0, 15000, by = 3000)) +
  # Zoom with coord_cartesian() rather than scale limits: scale limits turn
  # out-of-range values into NA and drop them, while this only clips the view.
  coord_cartesian(ylim = c(0, 15000)) +
  labs(
    title = "Total Discount Per Age Group across departments",
    x = "Age Group",
    y = "Total Discount"
  ) +
  # Age is already encoded in the fill legend, so the x labels are hidden to
  # keep the small panels readable.
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )


 # Linear trend of sales value against coupon discount, one line per
 # kids_count level, restricted to transactions that used a coupon.
 transactions %>%
   # Name the key explicitly so the join cannot silently pick up other
   # same-named columns.
   inner_join(demographics, by = "household_id") %>%
   filter(coupon_disc > 0) %>%
   ggplot(aes(x = coupon_disc, y = sales_value, color = kids_count)) +
   geom_smooth(method = "lm", formula = y ~ x) +
   ggtitle("Transaction Sales Value by Coupon Discount and Number of Kids")


  # Coupon discount summed per age range and brand, drawn as one point per
  # (age, brand) pair.
  transactions %>%
    inner_join(demographics, by = "household_id") %>%
    inner_join(products, by = "product_id") %>%
    group_by(age, brand) %>%
    summarize(total_discount = sum(coupon_disc, na.rm = TRUE), .groups = "drop") %>%
    ggplot(aes(x = age, y = total_discount, color = brand)) +
    geom_point() +
    scale_y_continuous(name = "Total Discount", labels = scales::dollar) +
    labs(
      title = "National vs Private Total Discount per Age Range",
      # The x axis is age, so the subtitle names age rather than income.
      subtitle = "Comparing total discount of two competing brands per age range",
      x = "Age Range",
      color = "Brand"
    )

  # Linear trend of sales value against retail discount per department, for
  # transactions with a sales value above 10.
  transactions %>%
    inner_join(products, by = "product_id") %>%
    # The filter is row-wise, so no grouping is needed before it.
    filter(sales_value > 10) %>%
    ggplot(aes(x = retail_disc, y = sales_value)) +
    geom_smooth(method = "lm", formula = y ~ x) +
    facet_wrap(~department) +
    ggtitle("Retail Discount vs Sales Value by Department")