# Load libraries and data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## 
## The following object is masked from 'package:scales':
## 
##     viridis_pal
library(ggrepel)

# Read the customer analytics DataMart from the working directory;
# column types are guessed by readr.
datamart <- read_csv(file = "datamart_customer_analytics.csv")
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean & prep
# Keep only customers with a recorded Income, then derive per-customer
# totals and a labelled complaint flag.
datamart <- datamart %>%
  filter(!is.na(Income)) %>%
  mutate(
    # Sum across every column whose name starts with "Mnt"; missing amounts
    # are ignored rather than propagating NA into the total.
    total_spending = rowSums(pick(starts_with("Mnt")), na.rm = TRUE),
    # Purchases summed over the web, catalog and store channels.
    total_purchases = NumWebPurchases + NumCatalogPurchases + NumStorePurchases,
    # Lower-case copy of Recency; the original column is kept as well.
    recency = Recency,
    # Pin the levels explicitly so the labels stay attached to the right
    # codes (and factor() does not error) even if only one value is present
    # after filtering.
    # NOTE(review): assumes Complain is coded 0 = no, 1 = yes -- confirm.
    Complain = factor(Complain, levels = c(0, 1), labels = c("No", "Yes"))
  )

# Report the size of the cleaned data set (rows = customers, cols = variables).
local({
  dims <- dim(datamart)
  cat("Dataset: ", dims[1], "customers, ", dims[2], "variables\n")
})
## Dataset:  2216 customers,  32 variables
# Preview the columns that matter for the channel-preference question:
# demographics plus the three per-channel purchase counts.
relevant_vars <- c(
  "Education", "Income", "Marital_Status", "Kidhome", "Teenhome",
  "NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases"
)
datamart %>%
  select(all_of(relevant_vars)) %>%
  head()
## # A tibble: 6 × 8
##   Education  Income Marital_Status Kidhome Teenhome NumWebPurchases
##   <chr>       <dbl> <chr>            <dbl>    <dbl>           <dbl>
## 1 Graduation  58138 Single               0        0               8
## 2 Graduation  46344 Single               1        1               1
## 3 Graduation  71613 Together             0        0               8
## 4 Graduation  26646 Together             1        0               2
## 5 PhD         58293 Married              1        0               5
## 6 Master      62513 Together             0        1               6
## # ℹ 2 more variables: NumCatalogPurchases <dbl>, NumStorePurchases <dbl>
# Our first visualization attempt

# Mean number of purchases per channel over all customers, in long form
# (one row per channel: Store, Web, Catalog).
channel_means <- datamart %>%
  select(
    Store = NumStorePurchases,
    Web = NumWebPurchases,
    Catalog = NumCatalogPurchases
  ) %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Channel",
    values_to = "AvgPurchases"
  )

# Bar chart of the overall channel averages.
plot_v1 <- channel_means %>%
  ggplot(aes(x = Channel, y = AvgPurchases, fill = Channel)) +
  geom_col() +
  labs(
    title = "First Attempt: Average Purchases Across Channels",
    subtitle = "Identifying issues with this approach",
    x = "Purchase Channel",
    y = "Average Number of Purchases"
  ) +
  theme_minimal()

print(plot_v1)

# Improved version addressing the main issue

# Calculate average purchases by Education level, then reshape to long form
# so each row is one Education x Channel pair.
edu_channel <- datamart %>%
  group_by(Education) %>%
  summarise(
    Store = mean(NumStorePurchases),
    Web = mean(NumWebPurchases),
    Catalog = mean(NumCatalogPurchases)
  ) %>%
  pivot_longer(
    cols = c(Store, Web, Catalog),
    names_to = "Channel",
    values_to = "AvgPurchases"
  )

# Create the plot: one line (plus points) per channel across education levels.
plot_v2 <- ggplot(
  edu_channel,
  aes(x = Education, y = AvgPurchases, group = Channel, color = Channel)
) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
  geom_line(linewidth = 1.2, aes(linetype = Channel)) + # Added linetype for better distinction in B&W
  geom_point(size = 2) +
  labs(
    title = "Iteration 1: Channel Preferences by Education Level",
    subtitle = "How this addresses the problem of customer segmentation",
    x = "Education Level",
    y = "Average Number of Purchases",
    # Same legend title for both aesthetics so ggplot2 merges them into one.
    color = "Channel",
    linetype = "Channel"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(plot_v2)

# Further refinements

# Reuse edu_channel (per-education channel averages) and draw one bar panel
# per channel, stacked vertically.
plot_v3 <- edu_channel %>%
  ggplot(aes(x = Education, y = AvgPurchases, group = 1, fill = Channel)) +
  # Bars instead of lines inside each facet.
  geom_col() +
  # Key change: each channel gets its own panel with an independent y-scale.
  facet_wrap(vars(Channel), ncol = 1, scales = "free_y") +
  labs(
    title = "Iteration 2: Channel Usage by Education (Faceted View)",
    subtitle = "Separate scales for each channel reveal hidden patterns",
    x = "Education Level",
    y = "Average Number of Purchases",
    fill = "Channel"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(plot_v3)

## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Dataset:  2216 customers,  32 variables
## # A tibble: 6 × 8
##   Education  Income Marital_Status Kidhome Teenhome NumWebPurchases
##   <chr>       <dbl> <chr>            <dbl>    <dbl>           <dbl>
## 1 Graduation  58138 Single               0        0               8
## 2 Graduation  46344 Single               1        1               1
## 3 Graduation  71613 Together             0        0               8
## 4 Graduation  26646 Together             1        0               2
## 5 PhD         58293 Married              1        0               5
## 6 Master      62513 Together             0        1               6
## # ℹ 2 more variables: NumCatalogPurchases <dbl>, NumStorePurchases <dbl>