1 Setup & Libraries

# ── Core data manipulation ──────────────────────────────────────────────────
library(tidyverse)
library(lubridate)
library(scales)

# ── Visualisation ───────────────────────────────────────────────────────────
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(patchwork)
library(RColorBrewer)
library(viridis)
library(treemapify)   # treemap plots

# ── Tables ──────────────────────────────────────────────────────────────────
library(knitr)
library(kableExtra)
library(DT)           # interactive HTML tables

# ── Unsupervised Learning ────────────────────────────────────────────────────
library(cluster)      # k-means, PAM, silhouette
library(factoextra)   # cluster visualisation
library(NbClust)      # optimal k selection
library(corrplot)     # correlation matrix

# ── Supervised Learning ──────────────────────────────────────────────────────
library(caret)        # train/test + CV framework
library(randomForest) # Random Forest classifier / regressor
library(xgboost)      # Gradient Boosting
library(rpart)        # Decision Trees
library(rpart.plot)   # Decision Tree visualisation
library(pROC)         # ROC / AUC
library(ROSE)         # class imbalance (ROSE / SMOTE)

# ── Text ─────────────────────────────────────────────────────────────────────
library(tidytext)
library(wordcloud)
library(SnowballC)

2 Data Loading

# ── Adjust this path to wherever your CSVs live ─────────────────────────────
DATA_PATH <- "C:/Users/PAYTAKHT/Desktop/spinaker/data/"

customers     <- read_csv(paste0(DATA_PATH, "olist_customers_dataset.csv"))
geolocation   <- read_csv(paste0(DATA_PATH, "olist_geolocation_dataset.csv"))
order_items   <- read_csv(paste0(DATA_PATH, "olist_order_items_dataset.csv"))
order_payments<- read_csv(paste0(DATA_PATH, "olist_order_payments_dataset.csv"))
order_reviews <- read_csv(paste0(DATA_PATH, "olist_order_reviews_dataset.csv"))
orders        <- read_csv(paste0(DATA_PATH, "olist_orders_dataset.csv"))
products      <- read_csv(paste0(DATA_PATH, "olist_products_dataset.csv"))
sellers       <- read_csv(paste0(DATA_PATH, "olist_sellers_dataset.csv"))
category_trans<- read_csv(paste0(DATA_PATH, "product_category_name_translation.csv"))

cat("Datasets loaded successfully.\n")

## Datasets loaded successfully.

cat(sprintf("Orders: %s rows | Products: %s rows | Items: %s rows\n",
            nrow(orders), nrow(products), nrow(order_items)))

## Orders: 99441 rows | Products: 32951 rows | Items: 112650 rows

3 Data Wrangling & Feature Engineering

# ── 1. Translate product categories to English ───────────────────────────────
products <- products %>%
  left_join(category_trans, by = "product_category_name") %>%
  mutate(
    category_en = coalesce(product_category_name_english, "unknown")
  )

# ── 2. Parse all date columns ────────────────────────────────────────────────
orders <- orders %>%
  mutate(across(contains("date") | contains("timestamp"),
                ~ymd_hms(.x, quiet = TRUE)))

# ── 3. Master order-level table ──────────────────────────────────────────────
orders_full <- orders %>%
  left_join(customers,      by = "customer_id") %>%
  left_join(order_payments  %>%
              group_by(order_id) %>%
              summarise(
                total_payment   = sum(payment_value, na.rm = TRUE),
                n_installments  = max(payment_installments, na.rm = TRUE),
                payment_type    = first(payment_type),
                .groups = "drop"),
            by = "order_id") %>%
  left_join(order_reviews %>%
              group_by(order_id) %>%
              slice_max(review_score, with_ties = FALSE) %>%
              select(order_id, review_score),
            by = "order_id") %>%
  mutate(
    delivery_days = as.numeric(
      difftime(order_delivered_customer_date,
               order_purchase_timestamp, units = "days")),
    delay_days = as.numeric(
      difftime(order_delivered_customer_date,
               order_estimated_delivery_date, units = "days")),
    is_late        = delay_days > 0,
    purchase_year  = year(order_purchase_timestamp),
    purchase_month = month(order_purchase_timestamp, label = TRUE),
    purchase_dow   = wday(order_purchase_timestamp, label = TRUE)
  )

# ── 4. Item-level enriched table ─────────────────────────────────────────────
items_full <- order_items %>%
  left_join(products %>%
              select(product_id, category_en,
                     product_weight_g, product_photos_qty),
            by = "product_id") %>%
  left_join(orders %>%
              select(order_id, order_status, order_purchase_timestamp),
            by = "order_id") %>%
  left_join(sellers %>%
              select(seller_id, seller_state),
            by = "seller_id") %>%
  mutate(
    revenue = price + freight_value,
    category_en = replace_na(category_en, "unknown")
  )

# ── 5. Product-level summary (sales + "unsold" flag) ────────────────────────
delivered_items <- items_full %>%
  filter(order_status == "delivered")

product_sales <- products %>%
  left_join(
    delivered_items %>%
      group_by(product_id) %>%
      summarise(
        units_sold    = n(),
        total_revenue = sum(revenue, na.rm = TRUE),
        avg_price     = mean(price, na.rm = TRUE),
        n_orders      = n_distinct(order_id),
        .groups = "drop"),
    by = "product_id") %>%
  mutate(
    units_sold    = replace_na(units_sold, 0),
    total_revenue = replace_na(total_revenue, 0),
    avg_price     = replace_na(avg_price, 0),
    n_orders      = replace_na(n_orders, 0),
    sold_flag     = units_sold > 0
  )

cat("Wrangling done.\n")

## Wrangling done.

cat(sprintf("Total products: %d  |  Sold: %d  |  Never sold: %d\n",
            nrow(product_sales),
            sum(product_sales$sold_flag),
            sum(!product_sales$sold_flag)))

## Total products: 32951  |  Sold: 32216  |  Never sold: 735

4 Exploratory Data Analysis

4.1 Overview Dashboard

kpi <- tibble(
  Metric  = c("Total Orders", "Delivered Orders",
              "Total Revenue (R$)", "Unique Products",
              "Active Sellers", "Avg Review Score"),
  Value   = c(
    comma(nrow(orders)),
    comma(sum(orders$order_status == "delivered", na.rm = TRUE)),
    dollar(sum(order_payments$payment_value, na.rm = TRUE),
           prefix = "R$ ", big.mark = ","),
    comma(nrow(products)),
    comma(nrow(sellers)),
    round(mean(order_reviews$review_score, na.rm = TRUE), 2)
  )
)

kpi %>%
  kbl(caption = "Key Business Metrics") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) %>%
  column_spec(2, bold = TRUE, color = "#2c7bb6")

Key Business Metrics
Metric	Value
Total Orders	99,441
Delivered Orders	96,478
Total Revenue (R$) </td> <td style="text-align:left;font-weight: bold;color: rgba(44, 123, 182, 255) !important;"> R$ 16,008,872
Unique Products	32,951
Active Sellers	3,095
Avg Review Score	4.09

4.2 Sales Timeline

monthly_revenue <- order_payments %>%
  left_join(orders %>% select(order_id, order_purchase_timestamp),
            by = "order_id") %>%
  filter(!is.na(order_purchase_timestamp)) %>%
  mutate(month = floor_date(order_purchase_timestamp, "month")) %>%
  group_by(month) %>%
  summarise(revenue = sum(payment_value, na.rm = TRUE),
            n_orders = n_distinct(order_id), .groups = "drop") %>%
  filter(between(month, as.Date("2017-01-01"), as.Date("2018-09-01")))

p1 <- ggplot(monthly_revenue, aes(month, revenue)) +
  geom_area(fill = "#3182bd", alpha = .3) +
  geom_line(color = "#3182bd", linewidth = 1) +
  geom_point(color = "#3182bd", size = 2) +
  scale_y_continuous(labels = dollar_format(prefix = "R$ ", big.mark = ",")) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "2 months") +
  labs(title = "Monthly Revenue", x = NULL, y = "Revenue (R$)") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2 <- ggplot(monthly_revenue, aes(month, n_orders)) +
  geom_col(fill = "#e6550d", alpha = .8) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "2 months") +
  scale_y_continuous(labels = comma) +
  labs(title = "Monthly Order Volume", x = NULL, y = "# Orders") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p1 / p2

4.3 Order Status Distribution

orders %>%
  count(order_status) %>%
  mutate(pct = n / sum(n),
         order_status = fct_reorder(order_status, n)) %>%
  ggplot(aes(order_status, n, fill = order_status)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = percent(pct, .1)), hjust = -.1, size = 3.5) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(expand = expansion(mult = c(0, .15))) +
  labs(title = "Order Status Distribution",
       x = NULL, y = "# Orders") +
  theme_minimal(base_size = 12)

5 Q1 — Which Products Are Most Often Purchased?

5.1 Top Categories by Units Sold

top_cat <- delivered_items %>%
  count(category_en, wt = NULL, name = "units_sold") %>%
  mutate(revenue = (delivered_items %>%
                      group_by(category_en) %>%
                      summarise(r = sum(revenue)))[["r"]][
                        match(category_en, (delivered_items %>%
                                              group_by(category_en) %>%
                                              summarise(r = sum(revenue)))[["category_en"]])]) %>%
  arrange(desc(units_sold)) %>%
  slice_head(n = 20)

# Re-compute cleanly
top_cat <- delivered_items %>%
  group_by(category_en) %>%
  summarise(units_sold    = n(),
            total_revenue = sum(revenue, na.rm = TRUE),
            avg_price     = mean(price, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(desc(units_sold)) %>%
  slice_head(n = 20)

ggplot(top_cat, aes(reorder(category_en, units_sold), units_sold,
                    fill = total_revenue)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_c(labels = dollar_format(prefix = "R$ ", big.mark = ","),
                       name   = "Total Revenue") +
  scale_y_continuous(labels = comma) +
  labs(title = "Top 20 Product Categories by Units Sold",
       subtitle = "Color = Total Revenue",
       x = NULL, y = "Units Sold") +
  theme_minimal(base_size = 12)

5.2 Revenue vs. Volume Quadrant Chart

cat_summary <- delivered_items %>%
  group_by(category_en) %>%
  summarise(units_sold    = n(),
            total_revenue = sum(revenue, na.rm = TRUE),
            avg_price     = mean(price, na.rm = TRUE),
            .groups = "drop") %>%
  filter(units_sold >= 50)   # remove noise from tiny categories

med_units   <- median(cat_summary$units_sold)
med_revenue <- median(cat_summary$total_revenue)

cat_summary <- cat_summary %>%
  mutate(quadrant = case_when(
    units_sold >= med_units & total_revenue >= med_revenue ~ "Stars",
    units_sold >= med_units & total_revenue <  med_revenue ~ "Volume Leaders",
    units_sold <  med_units & total_revenue >= med_revenue ~ "Premium Niche",
    TRUE                                                   ~ "Underperformers"
  ))

ggplot(cat_summary, aes(units_sold, total_revenue,
                        color = quadrant, size = avg_price)) +
  geom_point(alpha = .75) +
  geom_vline(xintercept = med_units,   linetype = "dashed", color = "grey50") +
  geom_hline(yintercept = med_revenue, linetype = "dashed", color = "grey50") +
  geom_text_repel(data = cat_summary %>% slice_max(units_sold, n = 10),
                  aes(label = category_en), size = 3, show.legend = FALSE) +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = dollar_format(prefix = "R$ ", big.mark = ",")) +
  scale_color_brewer(palette = "Set1") +
  labs(title    = "Product Category Quadrant Analysis",
       subtitle = "Axes are on log scale; size = average unit price",
       x = "Units Sold (log)", y = "Total Revenue (log)",
       color = "Quadrant", size = "Avg Price (R$)") +
  theme_minimal(base_size = 12)

5.3 Treemap of Revenue by Category

delivered_items %>%
  group_by(category_en) %>%
  summarise(total_revenue = sum(revenue, na.rm = TRUE),
            units_sold    = n(), .groups = "drop") %>%
  slice_max(total_revenue, n = 30) %>%
  ggplot(aes(area = total_revenue, fill = units_sold,
             label = paste0(category_en, "\n",
                            dollar(total_revenue, prefix = "R$", scale = 1e-3,
                                   suffix = "K", big.mark = ",")))) +
  geom_treemap() +
  geom_treemap_text(colour = "white", place = "centre", grow = FALSE, size = 10) +
  scale_fill_viridis_c(name = "Units Sold", labels = comma) +
  labs(title = "Revenue Treemap – Top 30 Categories") +
  theme_minimal()

5.4 Top 20 Individual Products

top_products <- delivered_items %>%
  group_by(product_id) %>%
  summarise(units_sold    = n(),
            total_revenue = sum(revenue, na.rm = TRUE),
            avg_price     = mean(price, na.rm = TRUE),
            .groups = "drop") %>%
  left_join(products %>% select(product_id, category_en),
            by = "product_id") %>%
  arrange(desc(units_sold)) %>%
  slice_head(n = 20)

top_products %>%
  select(product_id, category_en, units_sold, total_revenue, avg_price) %>%
  mutate(total_revenue = dollar(total_revenue, prefix = "R$ ", big.mark = ","),
         avg_price     = dollar(avg_price,     prefix = "R$ ")) %>%
  datatable(caption  = "Top 20 Best-Selling Products",
            options  = list(pageLength = 10),
            rownames = FALSE,
            colnames = c("Product ID","Category","Units Sold",
                         "Total Revenue","Avg Price"))

5.5 Sales by Day-of-Week & Hour

orders_full %>%
  filter(order_status == "delivered") %>%
  mutate(hour = hour(order_purchase_timestamp),
         dow  = wday(order_purchase_timestamp, label = TRUE, week_start = 1)) %>%
  count(dow, hour) %>%
  ggplot(aes(hour, dow, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(name = "# Orders", labels = comma) +
  scale_x_continuous(breaks = 0:23) +
  labs(title = "Purchase Heatmap: Day of Week × Hour",
       x = "Hour of Day", y = NULL) +
  theme_minimal(base_size = 12) +
  theme(panel.grid = element_blank())

6 Q2 — Products That Did NOT Find Their Customers

6.1 Unsold Products Overview

sold_summary <- product_sales %>%
  count(sold_flag) %>%
  mutate(pct   = n / sum(n),
         label = ifelse(sold_flag, "Sold", "Never Sold"))

ggplot(sold_summary, aes(x = 2, y = pct, fill = label)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  xlim(.5, 2.5) +
  geom_text(aes(label = paste0(label, "\n", percent(pct, .1))),
            position = position_stack(vjust = .5), size = 5) +
  scale_fill_manual(values = c("Never Sold" = "#d7301f",
                               "Sold"       = "#41ab5d")) +
  labs(title = "Product Catalogue: Sold vs. Never Sold") +
  theme_void() +
  theme(legend.position = "none")

6.2 Unsold Products by Category

# categories with the most unsold products
unsold_cat <- product_sales %>%
  group_by(category_en) %>%
  summarise(total     = n(),
            unsold    = sum(!sold_flag),
            sold      = sum(sold_flag),
            unsold_pct= unsold / total,
            .groups   = "drop") %>%
  filter(total >= 10) %>%
  arrange(desc(unsold_pct))

# Top 15 categories by unsold %
p_unsold_pct <- unsold_cat %>%
  slice_head(n = 15) %>%
  ggplot(aes(reorder(category_en, unsold_pct), unsold_pct, fill = unsold)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient(low = "#fdd0a2", high = "#d7301f", name = "# Unsold") +
  scale_y_continuous(labels = percent) +
  labs(title = "Top 15 Categories: Highest Unsold Rate",
       x = NULL, y = "Unsold Rate") +
  theme_minimal(base_size = 11)

# Top 15 categories by absolute # unsold
p_unsold_abs <- unsold_cat %>%
  slice_max(unsold, n = 15) %>%
  ggplot(aes(reorder(category_en, unsold), unsold, fill = unsold_pct)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient(low = "#fdd0a2", high = "#d7301f",
                      labels = percent, name = "Unsold %") +
  scale_y_continuous(labels = comma) +
  labs(title = "Top 15 Categories: Most Unsold Products (Absolute)",
       x = NULL, y = "# Unsold Products") +
  theme_minimal(base_size = 11)

p_unsold_pct / p_unsold_abs

6.3 Comparing Sold vs. Unsold Product Characteristics

compare_df <- product_sales %>%
  filter(!is.na(product_weight_g)) %>%
  mutate(group = ifelse(sold_flag, "Sold", "Never Sold"))

# Weight distribution
p_w <- ggplot(compare_df, aes(product_weight_g, fill = group)) +
  geom_density(alpha = .5) +
  scale_x_log10(labels = comma) +
  scale_fill_manual(values = c("Never Sold" = "#d7301f", "Sold" = "#41ab5d")) +
  labs(title = "Product Weight Distribution",
       x = "Weight (g, log scale)", fill = NULL) +
  theme_minimal(base_size = 11)

# Photo count
p_ph <- compare_df %>%
  filter(!is.na(product_photos_qty)) %>%
  ggplot(aes(factor(pmin(product_photos_qty, 10)), fill = group)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("Never Sold" = "#d7301f", "Sold" = "#41ab5d")) +
  scale_y_continuous(labels = percent) +
  labs(title = "Photo Count (capped @ 10)",
       x = "# Product Photos", y = "Proportion", fill = NULL) +
  theme_minimal(base_size = 11)

(p_w + p_ph + plot_layout(guides = "collect")) &
  theme(legend.position = "bottom")

6.4 Unsold Product Table

product_sales %>%
  filter(!sold_flag) %>%
  select(product_id, category_en,
         product_weight_g, product_photos_qty) %>%
  arrange(category_en) %>%
  datatable(caption  = "Never-Sold Products",
            options  = list(pageLength = 10, scrollX = TRUE),
            rownames = FALSE,
            colnames = c("Product ID","Category","Weight (g)","# Photos"))

7 Unsupervised Learning — Customer Segmentation (RFM + K-Means)

7.1 Build RFM Features

reference_date <- as.Date("2018-10-18")   # day after last purchase in dataset

rfm <- orders_full %>%
  filter(order_status == "delivered",
         !is.na(total_payment),
         total_payment > 0) %>%                        # guard: no zero monetary
  group_by(customer_unique_id) %>%
  summarise(
    recency   = as.numeric(reference_date - max(as.Date(order_purchase_timestamp),
                                                na.rm = TRUE)),
    frequency = n_distinct(order_id),
    monetary  = sum(total_payment, na.rm = TRUE),
    .groups   = "drop"
  ) %>%
  # drop any row where a value is NA, 0, negative, or Inf
  filter(
    if_all(c(recency, frequency, monetary),
           ~ !is.na(.) & is.finite(.) & . > 0)
  )

# Log-transform -> scale -> drop any column with zero variance
rfm_log <- rfm %>%
  mutate(across(c(recency, frequency, monetary), log1p)) %>%
  select(recency, frequency, monetary)

# scale() returns a matrix; keep as plain numeric matrix for kmeans
rfm_scaled <- scale(rfm_log)

# Final safety: remove rows still NA/NaN/Inf after scaling
# (can happen if a column has zero variance -> sd=0 -> division by 0)
ok_rows    <- complete.cases(rfm_scaled) &
              apply(rfm_scaled, 1, function(x) all(is.finite(x)))
rfm_scaled <- rfm_scaled[ok_rows, ]
rfm        <- rfm[ok_rows, ]

cat(sprintf("RFM table: %d customers (after cleaning)\n", nrow(rfm)))

## RFM table: 93356 customers (after cleaning)

cat(sprintf("NA in rfm_scaled: %d | Inf: %d\n",
            sum(is.na(rfm_scaled)), sum(!is.finite(rfm_scaled))))

## NA in rfm_scaled: 0 | Inf: 0

summary(rfm %>% select(recency, frequency, monetary))

##     recency        frequency         monetary       
##  Min.   : 50.0   Min.   : 1.000   Min.   :    9.59  
##  1st Qu.:164.0   1st Qu.: 1.000   1st Qu.:   63.06  
##  Median :268.0   Median : 1.000   Median :  107.78  
##  Mean   :287.5   Mean   : 1.033   Mean   :  165.20  
##  3rd Qu.:396.0   3rd Qu.: 1.000   3rd Qu.:  182.57  
##  Max.   :745.0   Max.   :15.000   Max.   :13664.08

7.2 Correlation Matrix

corrplot(cor(rfm %>% select(recency, frequency, monetary)),
         method = "color", addCoef.col = "black",
         tl.col = "black", tl.srt = 45,
         title  = "RFM Feature Correlations",
         mar    = c(0,0,2,0))

7.3 Optimal Number of Clusters

set.seed(42)

# fviz_nbclust iterates kmeans for every k on the full matrix -> O(n * k) RAM.
# With ~90k customers that blows memory. Sample 3 000 rows for diagnostics only;
# the final kmeans() below still runs on the full rfm_scaled.
SAMPLE_N   <- 3000
sample_idx <- sample(nrow(rfm_scaled), min(SAMPLE_N, nrow(rfm_scaled)))
rfm_sample <- rfm_scaled[sample_idx, ]

# ── Elbow (WSS) ──────────────────────────────────────────────────────────────
wss_vals <- map_dbl(1:10, function(k) {
  km <- kmeans(rfm_sample, centers = k, nstart = 10, iter.max = 100)
  km$tot.withinss
})

tibble(k = 1:10, wss = wss_vals) %>%
  ggplot(aes(k, wss)) +
  geom_line(color = "#3182bd", linewidth = 1) +
  geom_point(color = "#3182bd", size = 3) +
  scale_x_continuous(breaks = 1:10) +
  labs(title    = "Elbow Method — Optimal k for K-Means",
       subtitle = paste0("Computed on a random sample of ",
                         comma(nrow(rfm_sample)), " customers"),
       x = "Number of clusters k", y = "Total Within-Cluster SS") +
  theme_minimal(base_size = 12)

# ── Silhouette ───────────────────────────────────────────────────────────────
# Use a smaller sample for silhouette (O(n^2) distance matrix)
SIL_N      <- 1500
sil_idx    <- sample(nrow(rfm_scaled), min(SIL_N, nrow(rfm_scaled)))
rfm_sil    <- rfm_scaled[sil_idx, ]

sil_vals <- map_dbl(2:10, function(k) {
  km  <- kmeans(rfm_sil, centers = k, nstart = 10, iter.max = 100)
  ss  <- silhouette(km$cluster, dist(rfm_sil))
  mean(ss[, "sil_width"])
})

tibble(k = 2:10, sil = sil_vals) %>%
  ggplot(aes(k, sil)) +
  geom_line(color = "#e6550d", linewidth = 1) +
  geom_point(aes(color = sil == max(sil)), size = 3, show.legend = FALSE) +
  scale_color_manual(values = c("FALSE" = "#e6550d", "TRUE" = "#31a354")) +
  scale_x_continuous(breaks = 2:10) +
  labs(title    = "Silhouette Method — Optimal k",
       subtitle = paste0("Green point = best k | sample of ",
                         comma(nrow(rfm_sil)), " customers"),
       x = "Number of clusters k", y = "Average Silhouette Width") +
  theme_minimal(base_size = 12)

7.4 K-Means Clustering (k = 4)

set.seed(42)
K <- 4
km_fit <- kmeans(rfm_scaled, centers = K, nstart = 50, iter.max = 300)

rfm <- rfm %>% mutate(cluster = factor(km_fit$cluster))

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(rfm$cluster))

## 
##     1     2     3     4 
## 27886 37861 24808  2801

7.5 Cluster Visualisation

# PCA for 2-D projection
fviz_cluster(km_fit, data = rfm_scaled,
             geom = "point", ellipse.type = "convex",
             palette = "Set1", ggtheme = theme_minimal(base_size = 12),
             main = "K-Means Customer Clusters (PCA Projection)")

7.6 Cluster Profiles

cluster_profile <- rfm %>%
  group_by(cluster) %>%
  summarise(
    n_customers  = n(),
    avg_recency  = round(mean(recency)),
    avg_frequency= round(mean(frequency), 2),
    avg_monetary = round(mean(monetary), 2),
    .groups = "drop"
  ) %>%
  mutate(
    segment_label = case_when(
      avg_recency <  median(rfm$recency) & avg_frequency > 1 ~ "Champions",
      avg_recency <  median(rfm$recency)                      ~ "Promising",
      avg_monetary > quantile(rfm$monetary, .75)              ~ "At-Risk High Value",
      TRUE                                                    ~ "Hibernating"
    )
  )

cluster_profile %>%
  kbl(caption = "RFM Cluster Summary",
      col.names = c("Cluster","# Customers","Avg Recency (days)",
                    "Avg Frequency","Avg Monetary (R$)","Segment")) %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = FALSE) %>%
  column_spec(6, bold = TRUE, color = "white",
              background = c("#3182bd","#41ab5d","#d7301f","#fd8d3c"))

RFM Cluster Summary
Cluster	# Customers	Avg Recency (days)	Avg Frequency	Avg Monetary (R$)	Segment
1	27886	335	1.00	318.87	At-Risk High Value
2	37861	366	1.00	69.36	Hibernating
3	24808	117	1.00	122.54	Promising
4	2801	270	2.11	308.59	At-Risk High Value

Olist E-Commerce Sales Structure Analysis

Product Performance, Customer Segmentation & Predictive Modeling

Sales Analytics Team

2026-06-09

1 Setup & Libraries

2 Data Loading

3 Data Wrangling & Feature Engineering

4 Exploratory Data Analysis

4.1 Overview Dashboard

4.2 Sales Timeline

4.3 Order Status Distribution

5 Q1 — Which Products Are Most Often Purchased?

5.1 Top Categories by Units Sold

5.2 Revenue vs. Volume Quadrant Chart

5.3 Treemap of Revenue by Category

5.4 Top 20 Individual Products

5.5 Sales by Day-of-Week & Hour

6 Q2 — Products That Did NOT Find Their Customers

6.1 Unsold Products Overview

6.2 Unsold Products by Category

6.3 Comparing Sold vs. Unsold Product Characteristics

6.4 Unsold Product Table

7 Unsupervised Learning — Customer Segmentation (RFM + K-Means)

7.1 Build RFM Features

7.2 Correlation Matrix

7.3 Optimal Number of Clusters

7.4 K-Means Clustering (k = 4)

7.5 Cluster Visualisation

7.6 Cluster Profiles