#******************************************************
# Data Mining: Module 5 Lab
#
# Last Modified: 2/15/2025
#
# Authors: Sanjana Chenna
#******************************************************

# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
# Install tidytext only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("tidytext", quietly = TRUE)) {
  install.packages("tidytext", repos = "http://cran.us.r-project.org")
}
library(tidytext)

# Bring the three completejourney datasets used below into the workspace
data(list = c("transactions_sample", "products", "demographics"))
# ******* PLOT ONE *******

# Build the analysis table: each sampled transaction combined with its product
# attributes (matched on product_id) and its household's demographics
# (matched on household_id). Inner joins keep only rows with a match in both.
merged_data <- transactions_sample %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  # Coerce sales_value to numeric, then discard rows where it is missing
  mutate(sales_value = as.numeric(sales_value)) %>%
  drop_na(sales_value)

# Aggregate total sales by income level and product category.
# `.groups = "drop"` returns an ungrouped tibble, so the ranking step below
# regroups explicitly.
category_sales <- merged_data %>%
  group_by(income, product_category) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop")

# Keep the five best-selling product categories within each income level.
# slice_max() keeps ties by default, which can return more than five rows for
# a group; with_ties = FALSE caps every income level at five categories so the
# result matches the "Top 5" chart built from it.
top_categories <- category_sales %>%
  group_by(income) %>%
  slice_max(order_by = total_sales, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  # Reorder categories by total_sales separately within each income level
  # (tidytext); pair with scale_x_reordered() when plotting.
  mutate(product_category = reorder_within(product_category, total_sales, income))

# Horizontal bar chart of the top product categories, one panel per income
# level, with bars ordered by total sales inside each panel.
ggplot(
  data = top_categories,
  mapping = aes(x = product_category, y = total_sales, fill = income)
) +
  geom_col(show.legend = FALSE) +
  # Companion scale to reorder_within(): keeps the per-panel ordering
  scale_x_reordered() +
  facet_wrap(~income, scales = "free_y") +
  # Flip so category names run along the vertical axis and stay readable
  coord_flip() +
  labs(
    title = "Top 5 Product Categories by Income Level",
    subtitle = "Total sales value of the top 5 categories for each income group",
    x = "Product Category",
    y = "Total Sales ($)",
    caption = "Data Source: completejourney package"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 25, face = "bold"),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_text(size = 20, angle = 45, hjust = 1)
  )

# ******* PLOT TWO *******

# Add transaction_week: the transaction timestamp converted to a Date and
# floored to the start of its week.
# NOTE(review): the weekly_sales aggregation that follows groups by the
# dataset's existing `week` column and does not read this object -- confirm
# whether `transactions` is still needed.
transactions <- mutate(
  transactions_sample,
  transaction_week = floor_date(as.Date(transaction_timestamp), unit = "week")
)

# Total sales per income level and week: attach each household's demographics
# to its transactions, then sum sales_value within every (income, week) pair.
weekly_sales <- inner_join(
  transactions_sample,
  demographics,
  by = "household_id"
) %>%
  group_by(income, week) %>%
  summarise(
    total_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop"
  )

# Plot the time series of weekly sales by income level, one panel per income
# group. The legend is hidden because each panel is already labelled by its
# facet strip.
ggplot(weekly_sales, aes(x = week, y = total_sales, color = income)) +
  # `linewidth` replaces `size` for lines, which ggplot2 deprecated in 3.4.0
  geom_line(linewidth = 1) +
  facet_wrap(~income, scales = "free_y") +  # Creates separate plots for each income group
  labs(title = "Weekly Sales Trends by Income Group",
       subtitle = "Tracking weekly total sales across different income levels",
       x = "Week",
       y = "Total Sales ($)",
       caption = "Data Source: completejourney package") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none",
        strip.text = element_text(size = 12, face = "bold"))

# Single-panel comparison: weekly total sales for every income level drawn on
# shared axes, one coloured line per income group.
ggplot(weekly_sales, aes(x = week, y = total_sales, color = income, group = income)) +
  # `linewidth` replaces `size` for lines, which ggplot2 deprecated in 3.4.0
  geom_line(linewidth = 1) +
  labs(title = "Weekly Sales Trends Across Income Groups",
       subtitle = "Total weekly sales over time, grouped by income level",
       x = "Week",
       y = "Total Sales ($)",
       color = "Income Level",
       caption = "Data Source: completejourney package") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom",
        legend.title = element_text(face = "bold"),
        legend.text = element_text(size = 10))

# ***** PLOT THREE *****

# Per income level: total spend, total coupon discount, and the discount
# expressed as a percentage of spend.
# NOTE(review): only coupon_disc and coupon_match_disc are counted as
# discount; if the data carries other discount columns, confirm they are
# intentionally excluded.
discount_data <- transactions_sample %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(income) %>%
  summarise(
    total_spent = sum(sales_value, na.rm = TRUE),
    total_discount = sum(coupon_disc + coupon_match_disc, na.rm = TRUE),
    discount_percentage = total_discount / total_spent * 100,
    .groups = "drop"
  )

# Bar chart comparing the discount percentage across income levels; bars are
# coloured by income and the redundant legend is suppressed.
ggplot(
  data = discount_data,
  mapping = aes(x = income, y = discount_percentage, fill = income)
) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    title = "Discount Usage by Income Level",
    subtitle = "Percentage of total spending from discounts across income groups",
    x = "Income Level",
    y = "Discount Percentage (%)",
    caption = "Data Source: completejourney package"
  )