# ===================== Setup =====================
setwd("C:/Users/manuk/OneDrive/Desktop")
packages <- c("completejourney", "dplyr", "tidyr", "ggplot2")
lapply(packages, function(pkg) {
if (!require(pkg, character.only = TRUE)) install.packages(pkg)
library(pkg, character.only = TRUE)
})
## [[1]]
## [1] "completejourney" "stats" "graphics" "grDevices"
## [5] "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "dplyr" "completejourney" "stats" "graphics"
## [5] "grDevices" "utils" "datasets" "methods"
## [9] "base"
##
## [[3]]
## [1] "tidyr" "dplyr" "completejourney" "stats"
## [5] "graphics" "grDevices" "utils" "datasets"
## [9] "methods" "base"
##
## [[4]]
## [1] "ggplot2" "tidyr" "dplyr" "completejourney"
## [5] "stats" "graphics" "grDevices" "utils"
## [9] "datasets" "methods" "base"
transactions <- get_transactions()
transactions
## # A tibble: 1,469,307 × 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 900 330 31198570044 1095275 1 0.5 0
## 2 900 330 31198570047 9878513 1 0.99 0.1
## 3 1228 406 31198655051 1041453 1 1.43 0.15
## 4 906 319 31198705046 1020156 1 1.5 0.29
## 5 906 319 31198705046 1053875 2 2.78 0.8
## 6 906 319 31198705046 1060312 1 5.49 0.5
## 7 906 319 31198705046 1075313 1 1.5 0.29
## 8 1058 381 31198676055 985893 1 1.88 0.21
## 9 1058 381 31198676055 988791 1 1.5 1.29
## 10 1058 381 31198676055 9297106 1 2.69 0
## # ℹ 1,469,297 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
demographics <- demographics
products <- products
# ===================== Categories =====================
PASL_income <- c("50-74K", "75-99K", "100-124K", "125-149K",
"150-174K", "175-199K", "200-249K", "250K+")
High_income <- c("75-99K", "100-124K", "125-149K",
"150-174K", "175-199K", "200-249K", "250K+")
low_income <- c("Under 15K", "15-24K", "25-34K", "35-49K")
PASL_age <- c("25-34", "35-44", "45-54")
demographics <- demographics %>%
mutate(
kids_count = as.numeric(kids_count),
kids_count = ifelse(is.na(kids_count), 0, kids_count),
in_PASL_income = income %in% PASL_income,
in_High_income = income %in% High_income,
in_low_income = income %in% low_income,
looks_stable = in_High_income | (
(marital_status == "Married") &
(kids_count > 0) &
(age %in% PASL_age) &
(tolower(home_ownership) %in% c("own", "homeowner", "yes"))
)
)
head(demographics)
## # A tibble: 6 × 12
## household_id age income home_ownership marital_status household_size
## <chr> <ord> <ord> <ord> <ord> <ord>
## 1 1 65+ 35-49K Homeowner Married 2
## 2 1001 45-54 50-74K Homeowner Unmarried 1
## 3 1003 35-44 25-34K <NA> Unmarried 1
## 4 1004 25-34 15-24K <NA> Unmarried 1
## 5 101 45-54 Under 15K Homeowner Married 4
## 6 1012 35-44 35-49K <NA> Married 5+
## # ℹ 6 more variables: household_comp <ord>, kids_count <dbl>,
## # in_PASL_income <lgl>, in_High_income <lgl>, in_low_income <lgl>,
## # looks_stable <lgl>
# ===================== Graph 1 — Income above $50K =====================
PASL_table_1 <- demographics %>%
summarise(
Above_50K = sum(in_PASL_income, na.rm = TRUE),
Below_50K = n() - sum(in_PASL_income, na.rm = TRUE)
) %>%
pivot_longer(cols = everything(), names_to = "Category", values_to = "Value") %>%
mutate(Percentage = round(Value / sum(Value) * 100, 1))
ggplot(PASL_table_1, aes(x = "", y = Value, fill = Category)) +
geom_bar(stat = "identity", width = 1, color = "white") +
coord_polar("y", start = 0) +
geom_text(aes(label = paste0(Percentage, "%")), position = position_stack(vjust = 0.5)) +
theme_void() +
ggtitle("Individuals with Income more than $50K") +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
scale_fill_brewer(palette = "Set2")

# ===================== Graph 2 — Stability =====================
PASL_table_2 <- demographics %>%
summarise(
Stable = sum(looks_stable, na.rm = TRUE),
Unstable = n() - sum(looks_stable, na.rm = TRUE)
) %>%
pivot_longer(cols = everything(), names_to = "Category", values_to = "Value") %>%
mutate(Percentage = round(Value / sum(Value) * 100, 1))
ggplot(PASL_table_2, aes(x = "", y = Value, fill = Category)) +
geom_bar(stat = "identity", width = 1, color = "white") +
coord_polar("y", start = 0) +
geom_text(aes(label = paste0(Percentage, "%")), position = position_stack(vjust = 0.5)) +
theme_void() +
ggtitle("Individuals Representing Stability") +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
scale_fill_brewer(palette = "Set3")

# ===================== Graph 3 — Income vs Stability =====================
PASL_combined <- demographics %>%
group_by(in_PASL_income, looks_stable) %>%
summarise(Count = n(), .groups = "drop") %>%
mutate(
Income = ifelse(in_PASL_income, "Above $50K", "Below $50K"),
Stability = ifelse(looks_stable, "Stable", "Unstable")
)
ggplot(PASL_combined, aes(x = Income, y = Count, fill = Stability)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Comparing Income Levels with Stability Characteristics (Threshold $50K)") +
ylab("Number of Households") +
xlab("Level of Income") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
scale_fill_brewer(palette = "Pastel1")

# ===================== Nutritious Food Spending =====================
nutritious_food_categories <- list(
FRUIT = c("FRUIT"),
VEGETABLE = c("VEGETABLE"),
GRAIN = c("GRAIN"),
BEAN = c("BEAN"),
NUT = c("NUT"),
OAT = c("OAT"),
ORGANIC = c("ORGANIC")
)
map_nutritious <- function(product_type) {
if (is.na(product_type)) return(NA)
upper <- toupper(product_type)
for(cat in names(nutritious_food_categories)) {
if(any(sapply(nutritious_food_categories[[cat]], function(x) grepl(x, upper)))) {
return(cat)
}
}
return(NA)
}
products$category <- sapply(products$product_type, map_nutritious)
products <- products %>% filter(!is.na(category))
merged_data <- transactions %>%
left_join(demographics %>% select(household_id, in_PASL_income, looks_stable, in_low_income),
by = "household_id") %>%
inner_join(products %>% select(product_id, category), by = "product_id") %>%
mutate(spend = sales_value)
# ===================== Spending by Group =====================
stable_high <- merged_data %>% filter(in_PASL_income & looks_stable)
unstable_high <- merged_data %>% filter(in_PASL_income & !looks_stable)
stable_low <- merged_data %>% filter(in_low_income & looks_stable)
categories <- names(nutritious_food_categories)
group1_spending <- stable_high %>%
group_by(category) %>%
summarise(spend = sum(spend, na.rm = TRUE)) %>%
complete(category = categories, fill = list(spend = 0))
group2_spending <- unstable_high %>%
group_by(category) %>%
summarise(spend = sum(spend, na.rm = TRUE)) %>%
complete(category = categories, fill = list(spend = 0))
group3_spending <- stable_low %>%
group_by(category) %>%
summarise(spend = sum(spend, na.rm = TRUE)) %>%
complete(category = categories, fill = list(spend = 0))
nutritious_plot <- data.frame(
Category = categories,
`Stable & Above $50K` = group1_spending$spend,
`Unstable & Above $50K` = group2_spending$spend,
`Stable & Below $50K` = group3_spending$spend
)
# ===================== Final Visualization =====================
nutritious_plot_long <- nutritious_plot %>%
pivot_longer(cols = -Category, names_to = "Household_Type", values_to = "Spend")
ggplot(nutritious_plot_long, aes(x = Category, y = Spend, color = Household_Type, group = Household_Type)) +
geom_line(size = 1.2) +
geom_point(size = 3) +
ggtitle("How Different Household Categories Spend on Nutritious Foods") +
ylab("Total Spending ($)") +
xlab("Food Category") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 12)
) +
scale_color_brewer(palette = "Set1")
