library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
# Compare the average `sales_value` per transaction row, by calendar month,
# between transactions_sample and the full transactions data set.
transactions_sample
## # A tibble: 75,000 × 11
##    household_id store_id basket_id   product_id quantity sales_value retail_disc
##    <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
##  1 2261         309      31625220889 940996            1        3.86        0.43
##  2 2131         368      32053127496 873902            1        1.59        0.9 
##  3 511          316      32445856036 847901            1        1           0.69
##  4 400          388      31932241118 13094913          2       11.9         2.9 
##  5 918          340      32074655895 1085604           1        1.29        0   
##  6 718          324      32614612029 883203            1        2.5         0.49
##  7 868          323      32074722463 9884484           1        3.49        0   
##  8 1688         450      34850403304 1028715           1        2           1.79
##  9 467          31782    31280745102 896613            2        6.55        4.44
## 10 1947         32004    32744181707 978497            1        3.99        0   
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
transactions_full <- get_transactions()

# Average `sales_value` per calendar month for one transactions table.
#
# `transactions` needs a date-time `transaction_timestamp` column and a numeric
# `sales_value` column. Returns one row per month, sorted by month, with
# columns `transaction_month` (zero-padded "01".."12") and `avg_sales`.
#
# The month is read with format() rather than by splitting the printed
# timestamp on "-" and " ", so the result does not depend on how the date-time
# happens to be rendered as text.
# NOTE(review): months are pooled across years; this assumes the data covers a
# single calendar year -- confirm.
avg_sales_by_month <- function(transactions) {
  transactions %>%
    mutate(transaction_month = format(transaction_timestamp, "%m")) %>%
    group_by(transaction_month) %>%
    summarise(avg_sales = mean(sales_value))
}

# One row per month; `suffix` names the two averages `avg_sales_sample` and
# `avg_sales_full` directly, so no rename step is needed.
ts_v_t <- left_join(
  avg_sales_by_month(transactions_sample),
  avg_sales_by_month(transactions_full),
  by = "transaction_month",
  suffix = c("_sample", "_full")
)

# Turn "01".."12" into an English month-name factor so the x axis is in
# calendar order rather than alphabetical order.
monthly_avg_sales <- ts_v_t %>%
  mutate(
    transaction_month = factor(
      month.name[as.integer(transaction_month)],
      levels = month.name
    )
  )

# The x axis is a factor, so each line layer needs an explicit constant group
# to connect the months.
ggplot(monthly_avg_sales, aes(x = transaction_month)) +
  geom_line(
    aes(y = avg_sales_sample, color = "Transactions Sample", group = 1)
  ) +
  geom_line(
    aes(y = avg_sales_full, color = "Full Transactions", group = 1)
  ) +
  geom_point(aes(y = avg_sales_sample, color = "Transactions Sample")) +
  geom_point(aes(y = avg_sales_full, color = "Full Transactions")) +
  labs(
    title = "Average Sales by Month: Full Transactions Dataset v. Transactions Sample Dataset",
    x = "Month",
    y = "Average Sales"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  scale_color_manual(
    name = "Legend",
    values = c("Transactions Sample" = "blue", "Full Transactions" = "red")
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title.x = element_text(margin = margin(t = 10))
  )

# Total coupon savings (`coupon_disc`) per campaign.
#
# Transactions are matched to campaigns through `household_id` only, so every
# transaction of a household is counted toward each campaign that household
# appears in; the join is therefore declared many-to-many.
# NOTE(review): nothing here checks that a purchase fell inside the campaign
# or redeemed one of its coupons -- confirm this attribution is intended.
campaign_savings <- transactions_sample %>%
  inner_join(campaigns, by = "household_id", relationship = "many-to-many") %>%
  group_by(campaign_id) %>%
  summarise(total_savings = sum(coupon_disc)) %>%
  # Order campaign ids numerically ("2" before "10") for the x axis.
  mutate(campaign_id = fct_inseq(campaign_id)) %>%
  # Sort on the column itself; desc("campaign_id") would sort on a constant
  # string and leave the rows in their existing order.
  arrange(desc(campaign_id))

# One bar per campaign with pre-computed heights: geom_col() is the geom for
# that (geom_histogram(stat = "identity") warns about ignored bin parameters).
ggplot(campaign_savings, aes(x = campaign_id, y = total_savings)) +
  geom_col(color = "green", fill = "green") +
  labs(
    title = "Total Customer Savings by Campaign",
    x = "Campaign ID",
    y = "Customer Savings"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  theme_minimal()

# Total spend of each household, labelled with that household's age bracket.
#
# The result has exactly one row per household. Keeping one row per
# transaction with the household total repeated on each would make the stacked
# bars below count every household's total once per transaction.
household_spend <- demographics %>%
  inner_join(transactions_sample, by = "household_id") %>%
  group_by(household_id, age) %>%
  summarise(total_sales = sum(sales_value), .groups = "drop")

# Bars stack one segment per household, so each bar's height is the combined
# spend of the households in that age bracket.
ggplot(household_spend, aes(x = age, y = total_sales)) +
  geom_col(color = "orange") +
  labs(
    title = "Total Spend per Household by Age",
    x = "Household Age",
    y = "Household Spend"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  theme_minimal()