Midterm Project

library(ggplot2)
library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(RColorBrewer)
library(viridisLite)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(scales)
library(knitr)
library(rmarkdown)

products

## # A tibble: 92,331 × 7
##    product_id manufacturer_id department    brand  product_category product_type
##    <chr>      <chr>           <chr>         <fct>  <chr>            <chr>       
##  1 25671      2               GROCERY       Natio… FRZN ICE         ICE - CRUSH…
##  2 26081      2               MISCELLANEOUS Natio… <NA>             <NA>        
##  3 26093      69              PASTRY        Priva… BREAD            BREAD:ITALI…
##  4 26190      69              GROCERY       Priva… FRUIT - SHELF S… APPLE SAUCE 
##  5 26355      69              GROCERY       Priva… COOKIES/CONES    SPECIALTY C…
##  6 26426      69              GROCERY       Priva… SPICES & EXTRAC… SPICES & SE…
##  7 26540      69              GROCERY       Priva… COOKIES/CONES    TRAY PACK/C…
##  8 26601      69              DRUG GM       Priva… VITAMINS         VITAMIN - M…
##  9 26636      69              PASTRY        Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691      16              GROCERY       Priva… PNT BTR/JELLY/J… HONEY       
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>

transactions <- transactions_sample
summary(transactions_sample$transaction_timestamp)

##                       Min.                    1st Qu. 
## "2017-01-01 07:30:27.0000" "2017-04-01 13:23:49.0000" 
##                     Median                       Mean 
## "2017-07-02 11:43:04.0000" "2017-07-02 09:39:57.5612" 
##                    3rd Qu.                       Max. 
## "2017-10-02 16:28:50.2500" "2017-12-31 22:47:38.0000"

summary(transactions_sample$day_of_week)

## Warning: Unknown or uninitialised column: `day_of_week`.

## Length  Class   Mode 
##      0   NULL   NULL

summary(demographics$income)

## Under 15K    15-24K    25-34K    35-49K    50-74K    75-99K  100-124K  125-149K 
##        61        74        77       172       192        96        34        38 
##  150-174K  175-199K  200-249K     250K+ 
##        30        11         5        11

sales_data <- demographics %>%
  filter(kids_count >= 2) %>%
  inner_join(transactions, by = "household_id") %>%
  group_by(kids_count) %>%
  summarize(
    total_sales = sum(sales_value, na.rm = TRUE),
    avg_sales_per_household = mean(sales_value, na.rm = TRUE),
    total_transactions = n(),
    num_households = n_distinct(household_id),
    .groups = "drop")
  ggplot(sales_data, aes(x = factor(kids_count), y = total_sales, fill = kids_count)) +
  geom_col(color = "black", show.legend = FALSE) +
  geom_text(aes(label = scales::dollar(total_sales)), vjust = -0.5, size = 4) +
  scale_y_continuous(labels = dollar) +
  labs(
    title = "Total Sales by Household Size (>= 2)",
    subtitle = "2017 Grocery Data From Complete Journey",
    x = "Number of Kids in Household",
    y = "Total Sales ($)" ) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12, face = "italic")) +
  scale_fill_viridis_d()

sales_data %>%
  arrange(desc(total_sales)) %>%
  select(kids_count, num_households, total_sales, avg_sales_per_household, total_transactions)

## # A tibble: 2 × 5
##   kids_count num_households total_sales avg_sales_per_household
##   <ord>               <int>       <dbl>                   <dbl>
## 1 3+                     69      12738.                    3.16
## 2 2                      60      12196.                    3.01
## # ℹ 1 more variable: total_transactions <int>

sales_data <- transactions %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  mutate(marital_status = ifelse(is.na(marital_status), "Unknown", marital_status)) %>% # Handle NA values
  group_by(brand, marital_status) %>%
  summarize(
    total_quantity = sum(quantity, na.rm = TRUE),
    avg_quantity_per_purchase = mean(quantity, na.rm = TRUE),
    total_sales_value = sum(sales_value, na.rm = TRUE),
    num_transactions = n(),
    .groups = "drop" )

ggplot(sales_data, aes(x = marital_status, y = total_quantity, color = brand)) +
  geom_point(aes(size = total_sales_value), alpha = 0.7) + # Size represents total sales value
  geom_text(aes(label = scales::comma(total_quantity)), vjust = -0.5, size = 3.5, check_overlap = TRUE) +
  facet_wrap(~ brand, scales = "free_y") + # Separate plots for each brand
  scale_y_log10(labels = scales::comma) +
  scale_color_manual(values = c("red", "blue", "green", "purple", "orange", "brown")) + # Customize colors
  labs(
    title = "Total Quantity Purchased vs. Marital Status by Brand",
    subtitle = "Analysis of Product Sales Across Household Marital Status",
    x = "Marital Status",
    y = "Total Quantity Purchased (Log Scale)",
    size = "Total Sales ($)",
    color = "Brand") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, face = "italic"),
    legend.position = "right")

sales_data %>%
  arrange(desc(total_quantity)) %>%
  select(brand, marital_status, total_quantity, avg_quantity_per_purchase, total_sales_value, num_transactions)

## # A tibble: 6 × 6
##   brand   marital_status total_quantity avg_quantity_per_pur…¹ total_sales_value
##   <fct>   <chr>                   <dbl>                  <dbl>             <dbl>
## 1 Private 1                     2547669                 500.              17398.
## 2 Private 2                     1662707                 356.              14201.
## 3 Private Unknown                708589                 326.               6478.
## 4 Nation… 1                       17992                   1.30            44513.
## 5 Nation… 2                       14606                   1.30            35002.
## 6 Nation… Unknown                  6822                   1.35            16022.
## # ℹ abbreviated name: ¹avg_quantity_per_purchase
## # ℹ 1 more variable: num_transactions <int>

data_merged <- transactions_sample %>%
  left_join(products, by = "product_id") %>%
  left_join(demographics, by = "household_id")

income_levels <- c("Under 15K", "15-24K", "25-34K", "35-49K", "50-74K", "75-99K", "100-124K", "125-149K", "150-174K", "175-199K")
category_sales <- data_merged %>%
  filter(income %in% income_levels) %>%  # Filter income levels
  group_by(product_category, income) %>%
  summarise(total_sales_value = sum(sales_value, na.rm = TRUE), .groups = 'drop')
top_categories <- category_sales %>%
  group_by(product_category) %>%
  summarise(total_sales_value = sum(total_sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales_value)) %>%
  slice_head(n = 5) %>%  # Select top 5 categories
  pull(product_category)
filtered_data <- category_sales %>%
  filter(product_category %in% top_categories)
ggplot(filtered_data, aes(x = reorder(product_category, -total_sales_value), 
                          y = total_sales_value, fill = income)) +
  geom_col(position = position_dodge(width = 0.8)) +  
  labs(title = "Top 5 Product Categories vs Household Income Levels (< 199K)",
       subtitle = "Total sales value for top product categories across income levels",
       x = "Product Category",
       y = "Total Sales Value",
       fill = "Income Level") +
  scale_y_continuous(limits = c(0, max(filtered_data$total_sales_value) * 1.1)) +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  
  scale_fill_brewer(palette = "Paired")

Midterm Project

Khoi Pham, Truc Huynh

2025-02-25