Module 5 visualization lab

This document contains three informative plots that provide unique insights and tell a story about the completejourney dataset.

Calling the library for the completejourney dataset.

# Per-household spending summary paired with the household's income bracket.
#
# Every transaction row is left-joined to `demographics`, so rows without a
# demographics match carry an NA income; those households are removed by the
# final na.omit(). The result has one row per remaining household with
# `household_id`, `household_spending` and `household_income`, sorted by
# income in descending order.
#
# NOTE(review): `household_spending` is the sum of per-unit prices
# (discount-adjusted sales value divided by quantity), not the sum of
# `sales_value` itself -- confirm this is the intended "spending" metric.
household_exp <- transactions %>%
  left_join(demographics, by = "household_id") %>%
  # Row-wise calculation, so it is done before grouping. A zero or missing
  # quantity is replaced by 1 to avoid dividing by zero / NA.
  mutate(
    loyalty_card_price =
      (sales_value - (retail_disc + coupon_match_disc)) /
        ifelse(quantity == 0 | is.na(quantity), 1, quantity)
  ) %>%
  group_by(household_id) %>%
  summarize(
    household_spending = sum(loyalty_card_price, na.rm = TRUE),
    household_income = first(income)
  ) %>%
  arrange(desc(household_income)) %>%
  # Drop households with a missing value in any column. The piped data is the
  # only argument; the object being defined must not be passed in again.
  na.omit()

# Plot 1: distribution of household spending within each income bracket.
# Layers, in drawing order: jittered household points, a semi-transparent
# box plot (outliers hidden), and error-bar whisker caps.
household_exp %>%
  data.frame() %>%
  ggplot(aes(x = factor(household_income), y = household_spending)) +
  geom_jitter(color = "darkblue", size = 1.5, alpha = 0.3, width = 0.2) +
  geom_boxplot(
    outlier.shape = NA,
    fill = "skyblue",
    color = "darkblue",
    alpha = 0.6
  ) +
  stat_boxplot(geom = "errorbar", width = 0.2, color = "darkblue") +
  # Axis formatting: dollar labels on y, bracket labels cut to 10 characters.
  scale_x_discrete(labels = function(x) strtrim(x, 10)) +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    x = "Household Income",
    y = "Household Spending",
    title = "Total spending per total income for each household",
    subtitle = "Analyzing the total spending of each household based on their income bracket"
  ) +
  # Minimal theme with a larger base font, then per-element overrides.
  theme_minimal(base_size = 15) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "lightgray"),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    plot.title = element_text(hjust = 0.5, size = 15)
  )

# Plot 2: total sales for every kids-count / income-bracket combination.
# Only households present in both `demographics` and `transactions` are kept
# (inner join). Each point is one (kids_count, income) group, coloured by
# income bracket; points are jittered horizontally only, so the plotted
# totals are not distorted.
demographics %>%
  inner_join(transactions, by = "household_id") %>%
  # Group directly before summarising so the grouping is visible where it
  # is used.
  group_by(kids_count, income) %>%
  summarize(
    total_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = kids_count, y = total_sales)) +
  geom_jitter(aes(color = income), width = 0.2, height = 0) +
  # The y-axis title is set once, in labs() below.
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Total Sales of All Products per Income Range and Kids Count",
    subtitle = "Comparing the total sales of all products amongst income ranges and kids count",
    x = "Kids Count",
    y = "Total Sales",
    color = "Income"
  ) +
  theme_minimal()

# Mean `sales_value` per product category, keeping the five categories with
# the highest averages (ordered from highest to lowest).
lollipop_data <- transactions %>%
  left_join(products, by = "product_id") %>%
  group_by(product_category) %>%
  summarise(avg_spending = mean(sales_value, na.rm = TRUE)) %>%
  arrange(desc(avg_spending)) %>%
  head(n = 5)

# Plot 3: lollipop chart of the top product categories by average spending.
# Categories are ordered by `avg_spending`; coord_flip() places them on the
# vertical axis so the category names read horizontally.
ggplot(
  lollipop_data,
  aes(x = reorder(product_category, avg_spending), y = avg_spending)
) +
  # Stems run from 0 to the category average. `x` is inherited from the plot
  # mapping and `xend` uses the same reordered factor, so the category order
  # does not depend on which layer is added first.
  geom_segment(
    aes(
      xend = reorder(product_category, avg_spending),
      y = 0,
      yend = avg_spending
    ),
    color = "blue"
  ) +
  # Heads are drawn after the stems so they sit on top.
  geom_point(size = 4, color = "blue") +
  coord_flip() +
  labs(
    title = "Average Spending by Top 5 Product Categories",
    subtitle = "Highlighting the highest average spending across product categories",
    x = "Product Category",
    y = "Average Spending ($)"
  ) +
  theme_minimal()