Lab 4

library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(RColorBrewer)
library(viridisLite)

PLOT 1

summary(transactions_sample$transaction_timestamp)

##                       Min.                    1st Qu. 
## "2017-01-01 07:30:27.0000" "2017-04-01 13:23:49.0000" 
##                     Median                       Mean 
## "2017-07-02 11:43:04.0000" "2017-07-02 09:39:57.5612" 
##                    3rd Qu.                       Max. 
## "2017-10-02 16:28:50.2500" "2017-12-31 22:47:38.0000"

summary(transactions_sample$day_of_week)

## Warning: Unknown or uninitialised column: `day_of_week`.

## Length  Class   Mode 
##      0   NULL   NULL

summary(demographics$income)

## Under 15K    15-24K    25-34K    35-49K    50-74K    75-99K  100-124K  125-149K 
##        61        74        77       172       192        96        34        38 
##  150-174K  175-199K  200-249K     250K+ 
##        30        11         5        11

transactions_sample <- transactions_sample %>%
  mutate(day_of_week = wday(transaction_timestamp, label = TRUE))
weekly_sales <- transactions_sample %>%
  left_join(demographics, by = "household_id") %>%
  group_by(day_of_week, income) %>%
  summarise(total_sales_value = sum(sales_value, na.rm = TRUE))

## `summarise()` has grouped output by 'day_of_week'. You can override using the
## `.groups` argument.

head(weekly_sales)

## # A tibble: 6 × 3
## # Groups:   day_of_week [1]
##   day_of_week income    total_sales_value
##   <ord>       <ord>                 <dbl>
## 1 Sun         Under 15K             1569.
## 2 Sun         15-24K                1639.
## 3 Sun         25-34K                2004.
## 4 Sun         35-49K                4295.
## 5 Sun         50-74K                6151.
## 6 Sun         75-99K                3340.

max_sales_value <- max(weekly_sales$total_sales_value, na.rm = TRUE)
ggplot(weekly_sales, aes(x = day_of_week, y = total_sales_value, fill = income)) +
  geom_col(position = "dodge") +
  labs(title = "Weekly Sales Trend by Household Income Levels",
       subtitle = "Total sales value by day of the week segmented by income",
       x = "Day of the Week",
       y = "Total Sales Value",
       fill = "Income Level") +
  scale_y_continuous(limits = c(0, 8000)) + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_col()`).

PLOT 2

coupon_income_data <- coupon_redemptions %>%
  left_join(demographics, by = "household_id")
coupon_redemption_rate <- coupon_income_data %>%
  group_by(income) %>%
  summarise(total_redemptions = n())
ggplot(coupon_redemption_rate, aes(x = income, y = total_redemptions, fill = income)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Coupon Redemption Rate vs Household Income",
    subtitle = "Mostly Around 50 - 74K",
    x = "Household Income",
    y = "Total Coupon Redemptions"
  ) +
  theme_minimal()

PLOT 3

data_merged <- transactions_sample %>%
  left_join(products, by = "product_id") %>%
  left_join(demographics, by = "household_id")
category_sales <- data_merged %>%
  group_by(product_category, income) %>%
  summarise(total_sales_value = sum(sales_value, na.rm = TRUE), .groups = 'drop')
top_categories <- category_sales %>%
  group_by(product_category) %>%
  summarise(total_sales_value = sum(total_sales_value, na.rm = TRUE)) %>%
  top_n(10, wt = total_sales_value) %>%
  pull(product_category)
filtered_data <- category_sales %>%
  filter(product_category %in% top_categories)
ggplot(filtered_data, aes(x = reorder(product_category, -total_sales_value), 
                          y = total_sales_value, fill = income)) +
  geom_col(position = position_dodge(width = 1)) +  
  labs(title = "Top 10 Product Categories vs Household Income Levels",
       subtitle = "Total sales value for top product categories across income levels",
       x = "Product Category",
       y = "Total Sales Value",
       fill = "Income Level") +
  scale_y_continuous(limits = c(0, 3000)) +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  
  scale_fill_brewer(palette = "Set3")

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_col()`).

Lab 4

Truc Huynh

2025-02-11

PLOT 1

PLOT 2

PLOT 3