1 Setup & Libraries

# ── Core data manipulation ──────────────────────────────────────────────────
library(tidyverse)
library(lubridate)
library(scales)

# ── Visualisation ───────────────────────────────────────────────────────────
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(patchwork)
library(RColorBrewer)
library(viridis)
library(treemapify)   # treemap plots

# ── Tables ──────────────────────────────────────────────────────────────────
library(knitr)
library(kableExtra)
library(DT)           # interactive HTML tables

# ── Unsupervised Learning ────────────────────────────────────────────────────
library(cluster)      # k-means, PAM, silhouette
library(factoextra)   # cluster visualisation
library(NbClust)      # optimal k selection
library(corrplot)     # correlation matrix

# ── Supervised Learning ──────────────────────────────────────────────────────
library(caret)        # train/test + CV framework
library(randomForest) # Random Forest classifier / regressor
library(xgboost)      # Gradient Boosting
library(rpart)        # Decision Trees
library(rpart.plot)   # Decision Tree visualisation
library(pROC)         # ROC / AUC
library(ROSE)         # class imbalance (ROSE / SMOTE)

# ── Text ─────────────────────────────────────────────────────────────────────
library(tidytext)
library(wordcloud)
library(SnowballC)

2 Data Loading

# ── Adjust this path to wherever your CSVs live ─────────────────────────────
DATA_PATH <- "C:/Users/PAYTAKHT/Desktop/spinaker/data/"

customers     <- read_csv(paste0(DATA_PATH, "olist_customers_dataset.csv"))
geolocation   <- read_csv(paste0(DATA_PATH, "olist_geolocation_dataset.csv"))
order_items   <- read_csv(paste0(DATA_PATH, "olist_order_items_dataset.csv"))
order_payments<- read_csv(paste0(DATA_PATH, "olist_order_payments_dataset.csv"))
order_reviews <- read_csv(paste0(DATA_PATH, "olist_order_reviews_dataset.csv"))
orders        <- read_csv(paste0(DATA_PATH, "olist_orders_dataset.csv"))
products      <- read_csv(paste0(DATA_PATH, "olist_products_dataset.csv"))
sellers       <- read_csv(paste0(DATA_PATH, "olist_sellers_dataset.csv"))
category_trans<- read_csv(paste0(DATA_PATH, "product_category_name_translation.csv"))

cat("Datasets loaded successfully.\n")
## Datasets loaded successfully.
cat(sprintf("Orders: %s rows | Products: %s rows | Items: %s rows\n",
            nrow(orders), nrow(products), nrow(order_items)))
## Orders: 99441 rows | Products: 32951 rows | Items: 112650 rows

3 Data Wrangling & Feature Engineering

# ── 1. Translate product categories to English ───────────────────────────────
products <- products %>%
  left_join(category_trans, by = "product_category_name") %>%
  mutate(
    category_en = coalesce(product_category_name_english, "unknown")
  )

# ── 2. Parse all date columns ────────────────────────────────────────────────
orders <- orders %>%
  mutate(across(contains("date") | contains("timestamp"),
                ~ymd_hms(.x, quiet = TRUE)))

# ── 3. Master order-level table ──────────────────────────────────────────────
orders_full <- orders %>%
  left_join(customers,      by = "customer_id") %>%
  left_join(order_payments  %>%
              group_by(order_id) %>%
              summarise(
                total_payment   = sum(payment_value, na.rm = TRUE),
                n_installments  = max(payment_installments, na.rm = TRUE),
                payment_type    = first(payment_type),
                .groups = "drop"),
            by = "order_id") %>%
  left_join(order_reviews %>%
              group_by(order_id) %>%
              slice_max(review_score, with_ties = FALSE) %>%
              select(order_id, review_score),
            by = "order_id") %>%
  mutate(
    delivery_days = as.numeric(
      difftime(order_delivered_customer_date,
               order_purchase_timestamp, units = "days")),
    delay_days = as.numeric(
      difftime(order_delivered_customer_date,
               order_estimated_delivery_date, units = "days")),
    is_late        = delay_days > 0,
    purchase_year  = year(order_purchase_timestamp),
    purchase_month = month(order_purchase_timestamp, label = TRUE),
    purchase_dow   = wday(order_purchase_timestamp, label = TRUE)
  )

# ── 4. Item-level enriched table ─────────────────────────────────────────────
items_full <- order_items %>%
  left_join(products %>%
              select(product_id, category_en,
                     product_weight_g, product_photos_qty),
            by = "product_id") %>%
  left_join(orders %>%
              select(order_id, order_status, order_purchase_timestamp),
            by = "order_id") %>%
  left_join(sellers %>%
              select(seller_id, seller_state),
            by = "seller_id") %>%
  mutate(
    revenue = price + freight_value,
    category_en = replace_na(category_en, "unknown")
  )

# ── 5. Product-level summary (sales + "unsold" flag) ────────────────────────
delivered_items <- items_full %>%
  filter(order_status == "delivered")

product_sales <- products %>%
  left_join(
    delivered_items %>%
      group_by(product_id) %>%
      summarise(
        units_sold    = n(),
        total_revenue = sum(revenue, na.rm = TRUE),
        avg_price     = mean(price, na.rm = TRUE),
        n_orders      = n_distinct(order_id),
        .groups = "drop"),
    by = "product_id") %>%
  mutate(
    units_sold    = replace_na(units_sold, 0),
    total_revenue = replace_na(total_revenue, 0),
    avg_price     = replace_na(avg_price, 0),
    n_orders      = replace_na(n_orders, 0),
    sold_flag     = units_sold > 0
  )

cat("Wrangling done.\n")
## Wrangling done.
cat(sprintf("Total products: %d  |  Sold: %d  |  Never sold: %d\n",
            nrow(product_sales),
            sum(product_sales$sold_flag),
            sum(!product_sales$sold_flag)))
## Total products: 32951  |  Sold: 32216  |  Never sold: 735

4 Exploratory Data Analysis

4.1 Overview Dashboard

kpi <- tibble(
  Metric  = c("Total Orders", "Delivered Orders",
              "Total Revenue (R$)", "Unique Products",
              "Active Sellers", "Avg Review Score"),
  Value   = c(
    comma(nrow(orders)),
    comma(sum(orders$order_status == "delivered", na.rm = TRUE)),
    dollar(sum(order_payments$payment_value, na.rm = TRUE),
           prefix = "R$ ", big.mark = ","),
    comma(nrow(products)),
    comma(nrow(sellers)),
    round(mean(order_reviews$review_score, na.rm = TRUE), 2)
  )
)

kpi %>%
  kbl(caption = "Key Business Metrics") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) %>%
  column_spec(2, bold = TRUE, color = "#2c7bb6")
Key Business Metrics
Metric Value
Total Orders 99,441
Delivered Orders 96,478
Total Revenue (R\() </td> <td style="text-align:left;font-weight: bold;color: rgba(44, 123, 182, 255) !important;"> R\) 16,008,872
Unique Products 32,951
Active Sellers 3,095
Avg Review Score 4.09

4.2 Sales Timeline

monthly_revenue <- order_payments %>%
  left_join(orders %>% select(order_id, order_purchase_timestamp),
            by = "order_id") %>%
  filter(!is.na(order_purchase_timestamp)) %>%
  mutate(month = floor_date(order_purchase_timestamp, "month")) %>%
  group_by(month) %>%
  summarise(revenue = sum(payment_value, na.rm = TRUE),
            n_orders = n_distinct(order_id), .groups = "drop") %>%
  filter(between(month, as.Date("2017-01-01"), as.Date("2018-09-01")))

p1 <- ggplot(monthly_revenue, aes(month, revenue)) +
  geom_area(fill = "#3182bd", alpha = .3) +
  geom_line(color = "#3182bd", linewidth = 1) +
  geom_point(color = "#3182bd", size = 2) +
  scale_y_continuous(labels = dollar_format(prefix = "R$ ", big.mark = ",")) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "2 months") +
  labs(title = "Monthly Revenue", x = NULL, y = "Revenue (R$)") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2 <- ggplot(monthly_revenue, aes(month, n_orders)) +
  geom_col(fill = "#e6550d", alpha = .8) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "2 months") +
  scale_y_continuous(labels = comma) +
  labs(title = "Monthly Order Volume", x = NULL, y = "# Orders") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p1 / p2

4.3 Order Status Distribution

orders %>%
  count(order_status) %>%
  mutate(pct = n / sum(n),
         order_status = fct_reorder(order_status, n)) %>%
  ggplot(aes(order_status, n, fill = order_status)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = percent(pct, .1)), hjust = -.1, size = 3.5) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(expand = expansion(mult = c(0, .15))) +
  labs(title = "Order Status Distribution",
       x = NULL, y = "# Orders") +
  theme_minimal(base_size = 12)


5 Q1 — Which Products Are Most Often Purchased?

5.1 Top Categories by Units Sold

top_cat <- delivered_items %>%
  count(category_en, wt = NULL, name = "units_sold") %>%
  mutate(revenue = (delivered_items %>%
                      group_by(category_en) %>%
                      summarise(r = sum(revenue)))[["r"]][
                        match(category_en, (delivered_items %>%
                                              group_by(category_en) %>%
                                              summarise(r = sum(revenue)))[["category_en"]])]) %>%
  arrange(desc(units_sold)) %>%
  slice_head(n = 20)

# Re-compute cleanly
top_cat <- delivered_items %>%
  group_by(category_en) %>%
  summarise(units_sold    = n(),
            total_revenue = sum(revenue, na.rm = TRUE),
            avg_price     = mean(price, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(desc(units_sold)) %>%
  slice_head(n = 20)

ggplot(top_cat, aes(reorder(category_en, units_sold), units_sold,
                    fill = total_revenue)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_c(labels = dollar_format(prefix = "R$ ", big.mark = ","),
                       name   = "Total Revenue") +
  scale_y_continuous(labels = comma) +
  labs(title = "Top 20 Product Categories by Units Sold",
       subtitle = "Color = Total Revenue",
       x = NULL, y = "Units Sold") +
  theme_minimal(base_size = 12)

5.2 Revenue vs. Volume Quadrant Chart

cat_summary <- delivered_items %>%
  group_by(category_en) %>%
  summarise(units_sold    = n(),
            total_revenue = sum(revenue, na.rm = TRUE),
            avg_price     = mean(price, na.rm = TRUE),
            .groups = "drop") %>%
  filter(units_sold >= 50)   # remove noise from tiny categories

med_units   <- median(cat_summary$units_sold)
med_revenue <- median(cat_summary$total_revenue)

cat_summary <- cat_summary %>%
  mutate(quadrant = case_when(
    units_sold >= med_units & total_revenue >= med_revenue ~ "Stars",
    units_sold >= med_units & total_revenue <  med_revenue ~ "Volume Leaders",
    units_sold <  med_units & total_revenue >= med_revenue ~ "Premium Niche",
    TRUE                                                   ~ "Underperformers"
  ))

ggplot(cat_summary, aes(units_sold, total_revenue,
                        color = quadrant, size = avg_price)) +
  geom_point(alpha = .75) +
  geom_vline(xintercept = med_units,   linetype = "dashed", color = "grey50") +
  geom_hline(yintercept = med_revenue, linetype = "dashed", color = "grey50") +
  geom_text_repel(data = cat_summary %>% slice_max(units_sold, n = 10),
                  aes(label = category_en), size = 3, show.legend = FALSE) +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = dollar_format(prefix = "R$ ", big.mark = ",")) +
  scale_color_brewer(palette = "Set1") +
  labs(title    = "Product Category Quadrant Analysis",
       subtitle = "Axes are on log scale; size = average unit price",
       x = "Units Sold (log)", y = "Total Revenue (log)",
       color = "Quadrant", size = "Avg Price (R$)") +
  theme_minimal(base_size = 12)

5.3 Treemap of Revenue by Category

delivered_items %>%
  group_by(category_en) %>%
  summarise(total_revenue = sum(revenue, na.rm = TRUE),
            units_sold    = n(), .groups = "drop") %>%
  slice_max(total_revenue, n = 30) %>%
  ggplot(aes(area = total_revenue, fill = units_sold,
             label = paste0(category_en, "\n",
                            dollar(total_revenue, prefix = "R$", scale = 1e-3,
                                   suffix = "K", big.mark = ",")))) +
  geom_treemap() +
  geom_treemap_text(colour = "white", place = "centre", grow = FALSE, size = 10) +
  scale_fill_viridis_c(name = "Units Sold", labels = comma) +
  labs(title = "Revenue Treemap – Top 30 Categories") +
  theme_minimal()

5.4 Top 20 Individual Products

top_products <- delivered_items %>%
  group_by(product_id) %>%
  summarise(units_sold    = n(),
            total_revenue = sum(revenue, na.rm = TRUE),
            avg_price     = mean(price, na.rm = TRUE),
            .groups = "drop") %>%
  left_join(products %>% select(product_id, category_en),
            by = "product_id") %>%
  arrange(desc(units_sold)) %>%
  slice_head(n = 20)

top_products %>%
  select(product_id, category_en, units_sold, total_revenue, avg_price) %>%
  mutate(total_revenue = dollar(total_revenue, prefix = "R$ ", big.mark = ","),
         avg_price     = dollar(avg_price,     prefix = "R$ ")) %>%
  datatable(caption  = "Top 20 Best-Selling Products",
            options  = list(pageLength = 10),
            rownames = FALSE,
            colnames = c("Product ID","Category","Units Sold",
                         "Total Revenue","Avg Price"))

5.5 Sales by Day-of-Week & Hour

orders_full %>%
  filter(order_status == "delivered") %>%
  mutate(hour = hour(order_purchase_timestamp),
         dow  = wday(order_purchase_timestamp, label = TRUE, week_start = 1)) %>%
  count(dow, hour) %>%
  ggplot(aes(hour, dow, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(name = "# Orders", labels = comma) +
  scale_x_continuous(breaks = 0:23) +
  labs(title = "Purchase Heatmap: Day of Week × Hour",
       x = "Hour of Day", y = NULL) +
  theme_minimal(base_size = 12) +
  theme(panel.grid = element_blank())


6 Q2 — Products That Did NOT Find Their Customers

6.1 Unsold Products Overview

sold_summary <- product_sales %>%
  count(sold_flag) %>%
  mutate(pct   = n / sum(n),
         label = ifelse(sold_flag, "Sold", "Never Sold"))

ggplot(sold_summary, aes(x = 2, y = pct, fill = label)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  xlim(.5, 2.5) +
  geom_text(aes(label = paste0(label, "\n", percent(pct, .1))),
            position = position_stack(vjust = .5), size = 5) +
  scale_fill_manual(values = c("Never Sold" = "#d7301f",
                               "Sold"       = "#41ab5d")) +
  labs(title = "Product Catalogue: Sold vs. Never Sold") +
  theme_void() +
  theme(legend.position = "none")

6.2 Unsold Products by Category

# categories with the most unsold products
unsold_cat <- product_sales %>%
  group_by(category_en) %>%
  summarise(total     = n(),
            unsold    = sum(!sold_flag),
            sold      = sum(sold_flag),
            unsold_pct= unsold / total,
            .groups   = "drop") %>%
  filter(total >= 10) %>%
  arrange(desc(unsold_pct))

# Top 15 categories by unsold %
p_unsold_pct <- unsold_cat %>%
  slice_head(n = 15) %>%
  ggplot(aes(reorder(category_en, unsold_pct), unsold_pct, fill = unsold)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient(low = "#fdd0a2", high = "#d7301f", name = "# Unsold") +
  scale_y_continuous(labels = percent) +
  labs(title = "Top 15 Categories: Highest Unsold Rate",
       x = NULL, y = "Unsold Rate") +
  theme_minimal(base_size = 11)

# Top 15 categories by absolute # unsold
p_unsold_abs <- unsold_cat %>%
  slice_max(unsold, n = 15) %>%
  ggplot(aes(reorder(category_en, unsold), unsold, fill = unsold_pct)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient(low = "#fdd0a2", high = "#d7301f",
                      labels = percent, name = "Unsold %") +
  scale_y_continuous(labels = comma) +
  labs(title = "Top 15 Categories: Most Unsold Products (Absolute)",
       x = NULL, y = "# Unsold Products") +
  theme_minimal(base_size = 11)

p_unsold_pct / p_unsold_abs

6.3 Comparing Sold vs. Unsold Product Characteristics

compare_df <- product_sales %>%
  filter(!is.na(product_weight_g)) %>%
  mutate(group = ifelse(sold_flag, "Sold", "Never Sold"))

# Weight distribution
p_w <- ggplot(compare_df, aes(product_weight_g, fill = group)) +
  geom_density(alpha = .5) +
  scale_x_log10(labels = comma) +
  scale_fill_manual(values = c("Never Sold" = "#d7301f", "Sold" = "#41ab5d")) +
  labs(title = "Product Weight Distribution",
       x = "Weight (g, log scale)", fill = NULL) +
  theme_minimal(base_size = 11)

# Photo count
p_ph <- compare_df %>%
  filter(!is.na(product_photos_qty)) %>%
  ggplot(aes(factor(pmin(product_photos_qty, 10)), fill = group)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("Never Sold" = "#d7301f", "Sold" = "#41ab5d")) +
  scale_y_continuous(labels = percent) +
  labs(title = "Photo Count (capped @ 10)",
       x = "# Product Photos", y = "Proportion", fill = NULL) +
  theme_minimal(base_size = 11)

(p_w + p_ph + plot_layout(guides = "collect")) &
  theme(legend.position = "bottom")

6.4 Unsold Product Table

product_sales %>%
  filter(!sold_flag) %>%
  select(product_id, category_en,
         product_weight_g, product_photos_qty) %>%
  arrange(category_en) %>%
  datatable(caption  = "Never-Sold Products",
            options  = list(pageLength = 10, scrollX = TRUE),
            rownames = FALSE,
            colnames = c("Product ID","Category","Weight (g)","# Photos"))

7 Unsupervised Learning — Customer Segmentation (RFM + K-Means)

7.1 Build RFM Features

reference_date <- as.Date("2018-10-18")   # day after last purchase in dataset

rfm <- orders_full %>%
  filter(order_status == "delivered",
         !is.na(total_payment),
         total_payment > 0) %>%                        # guard: no zero monetary
  group_by(customer_unique_id) %>%
  summarise(
    recency   = as.numeric(reference_date - max(as.Date(order_purchase_timestamp),
                                                na.rm = TRUE)),
    frequency = n_distinct(order_id),
    monetary  = sum(total_payment, na.rm = TRUE),
    .groups   = "drop"
  ) %>%
  # drop any row where a value is NA, 0, negative, or Inf
  filter(
    if_all(c(recency, frequency, monetary),
           ~ !is.na(.) & is.finite(.) & . > 0)
  )

# Log-transform -> scale -> drop any column with zero variance
rfm_log <- rfm %>%
  mutate(across(c(recency, frequency, monetary), log1p)) %>%
  select(recency, frequency, monetary)

# scale() returns a matrix; keep as plain numeric matrix for kmeans
rfm_scaled <- scale(rfm_log)

# Final safety: remove rows still NA/NaN/Inf after scaling
# (can happen if a column has zero variance -> sd=0 -> division by 0)
ok_rows    <- complete.cases(rfm_scaled) &
              apply(rfm_scaled, 1, function(x) all(is.finite(x)))
rfm_scaled <- rfm_scaled[ok_rows, ]
rfm        <- rfm[ok_rows, ]

cat(sprintf("RFM table: %d customers (after cleaning)\n", nrow(rfm)))
## RFM table: 93356 customers (after cleaning)
cat(sprintf("NA in rfm_scaled: %d | Inf: %d\n",
            sum(is.na(rfm_scaled)), sum(!is.finite(rfm_scaled))))
## NA in rfm_scaled: 0 | Inf: 0
summary(rfm %>% select(recency, frequency, monetary))
##     recency        frequency         monetary       
##  Min.   : 50.0   Min.   : 1.000   Min.   :    9.59  
##  1st Qu.:164.0   1st Qu.: 1.000   1st Qu.:   63.06  
##  Median :268.0   Median : 1.000   Median :  107.78  
##  Mean   :287.5   Mean   : 1.033   Mean   :  165.20  
##  3rd Qu.:396.0   3rd Qu.: 1.000   3rd Qu.:  182.57  
##  Max.   :745.0   Max.   :15.000   Max.   :13664.08

7.2 Correlation Matrix

corrplot(cor(rfm %>% select(recency, frequency, monetary)),
         method = "color", addCoef.col = "black",
         tl.col = "black", tl.srt = 45,
         title  = "RFM Feature Correlations",
         mar    = c(0,0,2,0))

7.3 Optimal Number of Clusters

set.seed(42)

# fviz_nbclust iterates kmeans for every k on the full matrix -> O(n * k) RAM.
# With ~90k customers that blows memory. Sample 3 000 rows for diagnostics only;
# the final kmeans() below still runs on the full rfm_scaled.
SAMPLE_N   <- 3000
sample_idx <- sample(nrow(rfm_scaled), min(SAMPLE_N, nrow(rfm_scaled)))
rfm_sample <- rfm_scaled[sample_idx, ]

# ── Elbow (WSS) ──────────────────────────────────────────────────────────────
wss_vals <- map_dbl(1:10, function(k) {
  km <- kmeans(rfm_sample, centers = k, nstart = 10, iter.max = 100)
  km$tot.withinss
})

tibble(k = 1:10, wss = wss_vals) %>%
  ggplot(aes(k, wss)) +
  geom_line(color = "#3182bd", linewidth = 1) +
  geom_point(color = "#3182bd", size = 3) +
  scale_x_continuous(breaks = 1:10) +
  labs(title    = "Elbow Method — Optimal k for K-Means",
       subtitle = paste0("Computed on a random sample of ",
                         comma(nrow(rfm_sample)), " customers"),
       x = "Number of clusters k", y = "Total Within-Cluster SS") +
  theme_minimal(base_size = 12)

# ── Silhouette ───────────────────────────────────────────────────────────────
# Use a smaller sample for silhouette (O(n^2) distance matrix)
SIL_N      <- 1500
sil_idx    <- sample(nrow(rfm_scaled), min(SIL_N, nrow(rfm_scaled)))
rfm_sil    <- rfm_scaled[sil_idx, ]

sil_vals <- map_dbl(2:10, function(k) {
  km  <- kmeans(rfm_sil, centers = k, nstart = 10, iter.max = 100)
  ss  <- silhouette(km$cluster, dist(rfm_sil))
  mean(ss[, "sil_width"])
})

tibble(k = 2:10, sil = sil_vals) %>%
  ggplot(aes(k, sil)) +
  geom_line(color = "#e6550d", linewidth = 1) +
  geom_point(aes(color = sil == max(sil)), size = 3, show.legend = FALSE) +
  scale_color_manual(values = c("FALSE" = "#e6550d", "TRUE" = "#31a354")) +
  scale_x_continuous(breaks = 2:10) +
  labs(title    = "Silhouette Method — Optimal k",
       subtitle = paste0("Green point = best k | sample of ",
                         comma(nrow(rfm_sil)), " customers"),
       x = "Number of clusters k", y = "Average Silhouette Width") +
  theme_minimal(base_size = 12)

7.4 K-Means Clustering (k = 4)

set.seed(42)
K <- 4
km_fit <- kmeans(rfm_scaled, centers = K, nstart = 50, iter.max = 300)

rfm <- rfm %>% mutate(cluster = factor(km_fit$cluster))

cat("Cluster sizes:\n")
## Cluster sizes:
print(table(rfm$cluster))
## 
##     1     2     3     4 
## 27886 37861 24808  2801

7.5 Cluster Visualisation

# PCA for 2-D projection
fviz_cluster(km_fit, data = rfm_scaled,
             geom = "point", ellipse.type = "convex",
             palette = "Set1", ggtheme = theme_minimal(base_size = 12),
             main = "K-Means Customer Clusters (PCA Projection)")

7.6 Cluster Profiles

cluster_profile <- rfm %>%
  group_by(cluster) %>%
  summarise(
    n_customers  = n(),
    avg_recency  = round(mean(recency)),
    avg_frequency= round(mean(frequency), 2),
    avg_monetary = round(mean(monetary), 2),
    .groups = "drop"
  ) %>%
  mutate(
    segment_label = case_when(
      avg_recency <  median(rfm$recency) & avg_frequency > 1 ~ "Champions",
      avg_recency <  median(rfm$recency)                      ~ "Promising",
      avg_monetary > quantile(rfm$monetary, .75)              ~ "At-Risk High Value",
      TRUE                                                    ~ "Hibernating"
    )
  )

cluster_profile %>%
  kbl(caption = "RFM Cluster Summary",
      col.names = c("Cluster","# Customers","Avg Recency (days)",
                    "Avg Frequency","Avg Monetary (R$)","Segment")) %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = FALSE) %>%
  column_spec(6, bold = TRUE, color = "white",
              background = c("#3182bd","#41ab5d","#d7301f","#fd8d3c"))
RFM Cluster Summary
Cluster # Customers Avg Recency (days) Avg Frequency Avg Monetary (R$) Segment
1 27886 335 1.00 318.87 At-Risk High Value
2 37861 366 1.00 69.36 Hibernating
3 24808 117 1.00 122.54 Promising
4 2801 270 2.11 308.59 At-Risk High Value