# ===================== Setup =====================
setwd("C:/Users/manuk/OneDrive/Desktop")  

packages <- c("completejourney", "dplyr", "tidyr", "ggplot2")
lapply(packages, function(pkg) {
  if (!require(pkg, character.only = TRUE)) install.packages(pkg)
  library(pkg, character.only = TRUE)
})
## [[1]]
## [1] "completejourney" "stats"           "graphics"        "grDevices"      
## [5] "utils"           "datasets"        "methods"         "base"           
## 
## [[2]]
## [1] "dplyr"           "completejourney" "stats"           "graphics"       
## [5] "grDevices"       "utils"           "datasets"        "methods"        
## [9] "base"           
## 
## [[3]]
##  [1] "tidyr"           "dplyr"           "completejourney" "stats"          
##  [5] "graphics"        "grDevices"       "utils"           "datasets"       
##  [9] "methods"         "base"           
## 
## [[4]]
##  [1] "ggplot2"         "tidyr"           "dplyr"           "completejourney"
##  [5] "stats"           "graphics"        "grDevices"       "utils"          
##  [9] "datasets"        "methods"         "base"
transactions <- get_transactions()
transactions
## # A tibble: 1,469,307 × 11
##    household_id store_id basket_id   product_id quantity sales_value retail_disc
##    <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
##  1 900          330      31198570044 1095275           1        0.5         0   
##  2 900          330      31198570047 9878513           1        0.99        0.1 
##  3 1228         406      31198655051 1041453           1        1.43        0.15
##  4 906          319      31198705046 1020156           1        1.5         0.29
##  5 906          319      31198705046 1053875           2        2.78        0.8 
##  6 906          319      31198705046 1060312           1        5.49        0.5 
##  7 906          319      31198705046 1075313           1        1.5         0.29
##  8 1058         381      31198676055 985893            1        1.88        0.21
##  9 1058         381      31198676055 988791            1        1.5         1.29
## 10 1058         381      31198676055 9297106           1        2.69        0   
## # ℹ 1,469,297 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
demographics <- demographics
products <- products

# ===================== Categories =====================

PASL_income <- c("50-74K", "75-99K", "100-124K", "125-149K", 
                 "150-174K", "175-199K", "200-249K", "250K+")
High_income <- c("75-99K", "100-124K", "125-149K", 
                 "150-174K", "175-199K", "200-249K", "250K+")
low_income <- c("Under 15K", "15-24K", "25-34K", "35-49K")
PASL_age <- c("25-34", "35-44", "45-54")

demographics <- demographics %>%
  mutate(
    kids_count = as.numeric(kids_count),
    kids_count = ifelse(is.na(kids_count), 0, kids_count),
    in_PASL_income = income %in% PASL_income,       
    in_High_income = income %in% High_income,           
    in_low_income = income %in% low_income,         
    looks_stable = in_High_income | (                 
      (marital_status == "Married") &
        (kids_count > 0) &
        (age %in% PASL_age) &
        (tolower(home_ownership) %in% c("own", "homeowner", "yes"))
    )
  )

head(demographics)
## # A tibble: 6 × 12
##   household_id age   income    home_ownership marital_status household_size
##   <chr>        <ord> <ord>     <ord>          <ord>          <ord>         
## 1 1            65+   35-49K    Homeowner      Married        2             
## 2 1001         45-54 50-74K    Homeowner      Unmarried      1             
## 3 1003         35-44 25-34K    <NA>           Unmarried      1             
## 4 1004         25-34 15-24K    <NA>           Unmarried      1             
## 5 101          45-54 Under 15K Homeowner      Married        4             
## 6 1012         35-44 35-49K    <NA>           Married        5+            
## # ℹ 6 more variables: household_comp <ord>, kids_count <dbl>,
## #   in_PASL_income <lgl>, in_High_income <lgl>, in_low_income <lgl>,
## #   looks_stable <lgl>
# ===================== Graph 1 — Income above $50K =====================
PASL_table_1 <- demographics %>%
  summarise(
    Above_50K = sum(in_PASL_income, na.rm = TRUE),
    Below_50K = n() - sum(in_PASL_income, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Category", values_to = "Value") %>%
  mutate(Percentage = round(Value / sum(Value) * 100, 1))

ggplot(PASL_table_1, aes(x = "", y = Value, fill = Category)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(Percentage, "%")), position = position_stack(vjust = 0.5)) +
  theme_void() +
  ggtitle("Individuals with Income more than $50K") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  scale_fill_brewer(palette = "Set2")

# ===================== Graph 2 — Stability =====================
PASL_table_2 <- demographics %>%
  summarise(
    Stable = sum(looks_stable, na.rm = TRUE),
    Unstable = n() - sum(looks_stable, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Category", values_to = "Value") %>%
  mutate(Percentage = round(Value / sum(Value) * 100, 1))

ggplot(PASL_table_2, aes(x = "", y = Value, fill = Category)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(Percentage, "%")), position = position_stack(vjust = 0.5)) +
  theme_void() +
  ggtitle("Individuals Representing Stability") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  scale_fill_brewer(palette = "Set3")

# ===================== Graph 3 — Income vs Stability =====================
PASL_combined <- demographics %>%
  group_by(in_PASL_income, looks_stable) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(
    Income = ifelse(in_PASL_income, "Above $50K", "Below $50K"),
    Stability = ifelse(looks_stable, "Stable", "Unstable")
  )

ggplot(PASL_combined, aes(x = Income, y = Count, fill = Stability)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Comparing Income Levels with Stability Characteristics (Threshold $50K)") +
  ylab("Number of Households") +
  xlab("Level of Income") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  scale_fill_brewer(palette = "Pastel1")

# ===================== Nutritious Food Spending =====================
nutritious_food_categories <- list(
  FRUIT = c("FRUIT"),
  VEGETABLE = c("VEGETABLE"),
  GRAIN = c("GRAIN"),
  BEAN = c("BEAN"),
  NUT = c("NUT"),
  OAT = c("OAT"),
  ORGANIC = c("ORGANIC")
)

map_nutritious <- function(product_type) {
  if (is.na(product_type)) return(NA)
  upper <- toupper(product_type)
  for(cat in names(nutritious_food_categories)) {
    if(any(sapply(nutritious_food_categories[[cat]], function(x) grepl(x, upper)))) {
      return(cat)
    }
  }
  return(NA)
}

products$category <- sapply(products$product_type, map_nutritious)
products <- products %>% filter(!is.na(category))

merged_data <- transactions %>%
  left_join(demographics %>% select(household_id, in_PASL_income, looks_stable, in_low_income),
            by = "household_id") %>%
  inner_join(products %>% select(product_id, category), by = "product_id") %>%
  mutate(spend = sales_value)

# ===================== Spending by Group =====================
stable_high <- merged_data %>% filter(in_PASL_income & looks_stable)
unstable_high <- merged_data %>% filter(in_PASL_income & !looks_stable)
stable_low <- merged_data %>% filter(in_low_income & looks_stable)

categories <- names(nutritious_food_categories)

group1_spending <- stable_high %>%
  group_by(category) %>%
  summarise(spend = sum(spend, na.rm = TRUE)) %>%
  complete(category = categories, fill = list(spend = 0))

group2_spending <- unstable_high %>%
  group_by(category) %>%
  summarise(spend = sum(spend, na.rm = TRUE)) %>%
  complete(category = categories, fill = list(spend = 0))

group3_spending <- stable_low %>%
  group_by(category) %>%
  summarise(spend = sum(spend, na.rm = TRUE)) %>%
  complete(category = categories, fill = list(spend = 0))

nutritious_plot <- data.frame(
  Category = categories,
  `Stable & Above $50K` = group1_spending$spend,
  `Unstable & Above $50K` = group2_spending$spend,
  `Stable & Below $50K` = group3_spending$spend
)

# ===================== Final Visualization =====================
nutritious_plot_long <- nutritious_plot %>%
  pivot_longer(cols = -Category, names_to = "Household_Type", values_to = "Spend")

ggplot(nutritious_plot_long, aes(x = Category, y = Spend, color = Household_Type, group = Household_Type)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  ggtitle("How Different Household Categories Spend on Nutritious Foods") +
  ylab("Total Spending ($)") +
  xlab("Food Category") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 12)
  ) +
  scale_color_brewer(palette = "Set1")