library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
# Graph to compare avg sales by month in transactions_sample v. full transactions
# Preview the sample table by auto-printing it; the `##` lines that follow
# are the console output captured when this statement ran.
transactions_sample
## # A tibble: 75,000 × 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2261 309 31625220889 940996 1 3.86 0.43
## 2 2131 368 32053127496 873902 1 1.59 0.9
## 3 511 316 32445856036 847901 1 1 0.69
## 4 400 388 31932241118 13094913 2 11.9 2.9
## 5 918 340 32074655895 1085604 1 1.29 0
## 6 718 324 32614612029 883203 1 2.5 0.49
## 7 868 323 32074722463 9884484 1 3.49 0
## 8 1688 450 34850403304 1028715 1 2 1.79
## 9 467 31782 31280745102 896613 2 6.55 4.44
## 10 1947 32004 32744181707 978497 1 3.99 0
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
# Full transactions table, to compare against the bundled transactions_sample.
transactions_full <- get_transactions()

# Mean `sales_value` per calendar month for one transactions table.
#
# `transactions` needs a date-time `transaction_timestamp` column and a numeric
# `sales_value` column. Returns one row per month present in the data, sorted
# by month: `transaction_month` (zero-padded character, "01"-"12") and
# `avg_sales`.
# NOTE(review): months are pooled across years; that is only meaningful while
# the data covers a single year -- confirm.
monthly_avg_sales <- function(transactions) {
  transactions %>%
    # Read the month straight off the timestamp instead of splitting its
    # printed form on "-" and " ".
    mutate(transaction_month = format(transaction_timestamp, "%m")) %>%
    group_by(transaction_month) %>%
    summarise(avg_sales = mean(sales_value))
}

# Sample and full averages side by side. The sample is the left table, so a
# month missing from the sample is dropped from the comparison.
ts_v_t <- left_join(
  monthly_avg_sales(transactions_sample),
  monthly_avg_sales(transactions_full),
  by = "transaction_month",
  suffix = c("_sample", "_full")
)

# Month numbers -> month names, as a factor levelled in calendar order so the
# x axis runs January..December rather than alphabetically.
df <- ts_v_t %>%
  mutate(
    transaction_month = factor(
      month.name[as.integer(transaction_month)],
      levels = month.name
    )
  )

ggplot(df, aes(x = transaction_month)) +
  # A constant `group` lets each line connect across the discrete month axis.
  geom_line(
    aes(y = avg_sales_sample, color = "Transactions Sample", group = 1)
  ) +
  geom_line(
    aes(y = avg_sales_full, color = "Full Transactions", group = 2)
  ) +
  geom_point(aes(y = avg_sales_sample, color = "Transactions Sample")) +
  geom_point(aes(y = avg_sales_full, color = "Full Transactions")) +
  labs(
    title = "Average Sales by Month: Full Transactions Dataset v. Transactions Sample Dataset",
    x = "Month",
    y = "Average Sales"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  scale_color_manual(
    name = "Legend",
    values = c("Transactions Sample" = "blue", "Full Transactions" = "red")
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title.x = element_text(margin = margin(t = 10))
  )

transactions <- transactions_sample

# Pair every sampled transaction with each campaign its household received.
# NOTE(review): the match is on household_id only, so a purchase is credited
# to a campaign regardless of when it happened -- confirm this is intended.
data <- merge(transactions, campaigns, by = "household_id")

# Campaign ids become a factor ordered by numeric value (so "2" sorts before
# "10"), and rows are sorted from the highest campaign id down. The column is
# referenced bare: a quoted name would sort by a constant and do nothing.
sorted_data <- data %>%
  mutate(campaign_id = fct_inseq(campaign_id)) %>%
  arrange(desc(campaign_id))

# One total per campaign, so each bar is a single rectangle instead of a stack
# with one segment per transaction.
campaign_savings <- sorted_data %>%
  summarise(
    total_coupon_disc = sum(coupon_disc, na.rm = TRUE),
    .by = campaign_id
  )

# geom_col() is the bar geom for pre-computed heights; no binning is involved.
ggplot(campaign_savings, aes(x = campaign_id, y = total_coupon_disc)) +
  geom_col(fill = "green") +
  labs(
    title = "Total Customer Savings by Campaign",
    x = "Campaign ID",
    y = "Customer Savings"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  theme_minimal()

transactions <- transactions_sample

# Keep only transactions from households that have a demographics record.
df <- inner_join(demographics, transactions, by = "household_id")

# One row per household: its total spend and its age bracket. summarise()
# collapses each household to a single row, so a household's total enters the
# chart exactly once (not once per transaction).
# NOTE(review): assumes `age` is constant within a household, i.e. one
# demographics row per household_id -- confirm.
data <- df %>%
  group_by(household_id) %>%
  summarise(total_sales = sum(sales_value), age = first(age))

# Household totals stack within each age bracket, so bar height is the combined
# spend of all households in that bracket.
ggplot(data, aes(x = age, y = total_sales)) +
  geom_col(fill = "orange") +
  labs(
    title = "Total Spend per Household by Age",
    x = "Household Age",
    y = "Household Spend"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  theme_minimal()
