# Author: Shams Sadhin

# Date: 02-16-2025

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
# Load the transactions data through completejourney's get_transactions();
# every plot below reads from this `transactions` object.
transactions <- get_transactions()
# Auto-print the table as a quick preview of its contents.
transactions
# Plot 1 ----

# Vitamin purchases made by households that have a demographics record.
# Built once and reused below so the joins against `transactions` are not
# executed twice; `products` is filtered first so only vitamin rows are joined.
vitamin_sales <- products %>%
  filter(product_category == "VITAMINS") %>%
  inner_join(transactions, by = "product_id") %>%
  inner_join(demographics, by = "household_id")

# The six vitamin product types with the largest total quantity.
top_vitamins <- vitamin_sales %>%
  group_by(product_type) %>%
  summarise(total_quantity = sum(quantity, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_quantity)) %>%
  slice_head(n = 6)

# Total quantity of each of those top types within every age group.
vitamin_data <- vitamin_sales %>%
  filter(product_type %in% top_vitamins$product_type) %>%
  group_by(age, product_type) %>%
  summarise(total_quantity = sum(quantity, na.rm = TRUE), .groups = "drop")

# Dodged bar chart: one cluster per vitamin type, one bar per age group.
# geom_col() draws bar heights straight from `y`, which is what
# geom_bar(stat = "identity") does.
ggplot(
  vitamin_data,
  aes(
    x = reorder(product_type, total_quantity),
    y = total_quantity,
    fill = as.factor(age)
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "Types of Vitamin Consumption by Age Group",
    x = "Vitamins Type",
    y = "Total Quantity",
    fill = "Age"
  ) +
  theme_minimal() +
  # Vertical tick labels keep the long product-type names from overlapping.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))


# Plot 2 ----

# Auto-print the household demographics table as a quick preview.
demographics

# Total coupon discount per household, tagged with the household's age group.
# `transactions` is reused from the load at the top of the script instead of
# calling get_transactions() a second time.
coupon_data <- transactions %>%
  inner_join(demographics, by = "household_id") %>%
  group_by(age, household_id) %>%
  summarise(
    total_coupon_discount = sum(coupon_disc, na.rm = TRUE),
    .groups = "drop"
  )

# Box plot of per-household coupon totals for each age group.
# The 0-100 window is applied with coord_cartesian() rather than with scale
# limits: scale limits drop out-of-range households *before* stat_boxplot()
# runs, which distorts the quartiles and whiskers, whereas coord_cartesian()
# only zooms the view and leaves the statistics computed on all households.
ggplot(coupon_data, aes(x = age, y = total_coupon_discount, fill = age)) +
  geom_boxplot(
    alpha = 0.7,
    width = 0.6,
    outlier.color = "black",
    outlier.size = 1.5
  ) +
  scale_fill_brewer(palette = "Set3") +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  coord_cartesian(ylim = c(0, 100)) +
  labs(
    title = "Total Coupon Discount Redeemed by Age Group",
    x = "Age Group",
    y = "Total Coupon Discount"
  ) +
  theme_minimal() +
  # The x axis already names each age group, so the fill legend is redundant.
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )


# Plot 3 ----

# Catalogue rows for bread products only.
bread_products <- products %>%
  filter(product_category == "BREAD")

# Total quantity per bread product type across all transactions.
bread_totals <- bread_products %>%
  inner_join(transactions, by = "product_id") %>%
  group_by(product_type) %>%
  summarise(total_quantity = sum(quantity, na.rm = TRUE), .groups = "drop")

# Names of the six bread product types with the highest total quantity.
top_bread_types <- bread_totals %>%
  slice_max(total_quantity, n = 6) %>%
  pull(product_type)

# Individual purchases of those top bread types, enriched with household
# demographics (age for the facets, income for the point colour).
bread_purchases <- bread_products %>%
  filter(product_type %in% top_bread_types) %>%
  inner_join(transactions, by = "product_id") %>%
  inner_join(demographics, by = "household_id")

# Scatter plot: quantity per purchase by bread type, one panel per age group.
ggplot(bread_purchases, aes(x = quantity, y = product_type)) +
  geom_point(aes(color = income), alpha = 0.7) +
  facet_wrap(vars(age)) +
  scale_color_brewer(palette = "Set3") +
  guides(color = guide_legend(title = "Income Group")) +
  labs(
    title = "Top Bread Products Purchased By Different Age-Groups",
    x = "Quantity Purchased",
    y = "Different Product Types of Bread"
  ) +
  theme_minimal()

# End of Plot 3