knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width = 10,
  fig.height = 6
)

# Install once if needed:
# install.packages(c("tidyverse", "janitor", "lubridate", "scales", "knitr"))

library(tidyverse)
library(janitor)
library(lubridate)
library(scales)
library(knitr)

theme_set(theme_minimal(base_size = 12))
set.seed(123)

1 Objective

This report creates four customer segments using k-means clustering with Mahalanobis distance logic.

Standard k-means minimizes Euclidean distance. To use Mahalanobis distance, the numeric customer feature matrix is whitened with the inverse square root of its covariance matrix. Euclidean k-means on the whitened matrix is equivalent to k-means in the original feature space using Mahalanobis distance.

The cluster with the largest average Mahalanobis distance from the global customer center is interpreted as the outlier cluster.

2 Load Data

customers <- read_csv("olist_customers_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

orders <- read_csv("olist_orders_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

order_items <- read_csv("olist_order_items_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

payments <- read_csv("olist_order_payments_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

reviews <- read_csv("olist_order_reviews_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

list(
  customers = dim(customers),
  orders = dim(orders),
  order_items = dim(order_items),
  payments = dim(payments),
  reviews = dim(reviews)
)
## $customers
## [1] 99441     5
## 
## $orders
## [1] 99441     8
## 
## $order_items
## [1] 112650      7
## 
## $payments
## [1] 103886      5
## 
## $reviews
## [1] 100000      7

3 Build Customer-Level Features

orders_clean <- orders %>%
  mutate(
    order_purchase_timestamp = ymd_hms(order_purchase_timestamp),
    order_delivered_customer_date = ymd_hms(order_delivered_customer_date),
    order_estimated_delivery_date = ymd_hms(order_estimated_delivery_date),
    delivered_late = case_when(
      !is.na(order_delivered_customer_date) & !is.na(order_estimated_delivery_date) ~
        as.numeric(order_delivered_customer_date > order_estimated_delivery_date),
      TRUE ~ NA_real_
    ),
    delivery_days = as.numeric(
      difftime(order_delivered_customer_date, order_purchase_timestamp, units = "days")
    )
  )

item_order_summary <- order_items %>%
  group_by(order_id) %>%
  summarise(
    items = n(),
    item_spend = sum(price, na.rm = TRUE),
    freight_spend = sum(freight_value, na.rm = TRUE),
    sellers = n_distinct(seller_id),
    products = n_distinct(product_id),
    .groups = "drop"
  )

payment_order_summary <- payments %>%
  group_by(order_id) %>%
  summarise(
    payment_value = sum(payment_value, na.rm = TRUE),
    avg_installments = mean(payment_installments, na.rm = TRUE),
    payment_methods = n_distinct(payment_type),
    .groups = "drop"
  )

review_order_summary <- reviews %>%
  group_by(order_id) %>%
  summarise(
    review_score = mean(review_score, na.rm = TRUE),
    .groups = "drop"
  )

order_analysis <- orders_clean %>%
  left_join(customers, by = "customer_id") %>%
  left_join(item_order_summary, by = "order_id") %>%
  left_join(payment_order_summary, by = "order_id") %>%
  left_join(review_order_summary, by = "order_id") %>%
  filter(order_status == "delivered") %>%
  mutate(
    items = replace_na(items, 0),
    item_spend = replace_na(item_spend, 0),
    freight_spend = replace_na(freight_spend, 0),
    payment_value = coalesce(payment_value, item_spend + freight_spend),
    sellers = replace_na(sellers, 0),
    products = replace_na(products, 0),
    payment_methods = replace_na(payment_methods, 0)
  )

analysis_end_date <- max(order_analysis$order_purchase_timestamp, na.rm = TRUE)
customer_features <- order_analysis %>%
  group_by(customer_unique_id, customer_state) %>%
  summarise(
    orders = n_distinct(order_id),
    total_items = sum(items, na.rm = TRUE),
    total_products = sum(products, na.rm = TRUE),
    total_spend = sum(payment_value, na.rm = TRUE),
    item_spend = sum(item_spend, na.rm = TRUE),
    freight_spend = sum(freight_spend, na.rm = TRUE),
    avg_order_value = mean(payment_value, na.rm = TRUE),
    avg_items_per_order = mean(items, na.rm = TRUE),
    avg_sellers_per_order = mean(sellers, na.rm = TRUE),
    avg_installments = mean(avg_installments, na.rm = TRUE),
    avg_review_score = mean(review_score, na.rm = TRUE),
    avg_delivery_days = mean(delivery_days, na.rm = TRUE),
    late_delivery_rate = mean(delivered_late, na.rm = TRUE),
    first_purchase = min(order_purchase_timestamp, na.rm = TRUE),
    last_purchase = max(order_purchase_timestamp, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    active_days = pmax(
      as.numeric(difftime(last_purchase, first_purchase, units = "days")),
      0
    ),
    recency_days = as.numeric(difftime(analysis_end_date, last_purchase, units = "days")),
    freight_share = freight_spend / pmax(total_spend, 1)
  )

customer_features %>%
  select(
    customer_unique_id,
    customer_state,
    orders,
    total_items,
    total_spend,
    avg_order_value,
    avg_review_score,
    recency_days
  ) %>%
  slice_head(n = 10) %>%
  kable()
customer_unique_id customer_state orders total_items total_spend avg_order_value avg_review_score recency_days
0000366f3b9a7992bf8c76cfdf3221e2 SP 1 1 141.90 141.90 5 111.1696
0000b849f77a49e4a4ce2b2a4ca5be3f SP 1 1 27.19 27.19 4 114.1591
0000f46a3911fa3c0805444483337064 SC 1 1 86.22 86.22 3 536.7469
0000f6ccb0745a6a4b88665a16c9f078 PA 1 1 43.62 43.62 4 320.7715
0004aac84e0df4da2b147fca70cf8255 SP 1 1 196.89 196.89 5 287.8020
0004bd2a26a76fe21f786e4fbd80607f SP 1 1 166.98 166.98 4 145.8107
00050ab1314c0e55a6ca13cf7181fecf SP 1 1 35.38 35.38 4 131.0856
00053a61a98854899e70ed204dd4bafe PR 1 2 419.18 419.18 1 182.1562
0005e1862207bf6ccc02e4228effd9a0 RJ 1 1 150.12 150.12 4 542.6447
0005ef4cd20d2893f0d9fbd94d3c0d97 MA 1 1 129.76 129.76 1 169.9850

4 Mahalanobis Feature Transformation

The features below are transformed before clustering. Highly skewed monetary and count variables use log1p. Missing or infinite values are replaced with medians.

replace_bad_numeric <- function(x) {
  x[is.nan(x) | is.infinite(x)] <- NA_real_
  fallback <- median(x, na.rm = TRUE)
  if (is.na(fallback) || is.nan(fallback) || is.infinite(fallback)) {
    fallback <- 0
  }
  replace_na(x, fallback)
}

cluster_input <- customer_features %>%
  select(
    orders,
    total_items,
    total_products,
    total_spend,
    item_spend,
    freight_spend,
    avg_order_value,
    avg_items_per_order,
    avg_sellers_per_order,
    avg_installments,
    avg_review_score,
    avg_delivery_days,
    late_delivery_rate,
    active_days,
    recency_days,
    freight_share
  ) %>%
  mutate(
    across(
      c(
        orders,
        total_items,
        total_products,
        total_spend,
        item_spend,
        freight_spend,
        avg_order_value,
        avg_items_per_order,
        avg_sellers_per_order,
        avg_installments,
        avg_delivery_days,
        active_days,
        recency_days
      ),
      log1p
    ),
    across(everything(), replace_bad_numeric)
  )

feature_center <- colMeans(cluster_input)
feature_covariance <- cov(cluster_input)

# A small ridge term keeps the covariance matrix invertible when features are correlated.
ridge <- 1e-6
regularized_covariance <- feature_covariance +
  diag(ridge, ncol(feature_covariance))

eig <- eigen(regularized_covariance)
inverse_sqrt_covariance <- eig$vectors %*%
  diag(1 / sqrt(pmax(eig$values, ridge))) %*%
  t(eig$vectors)

whitened_features <- sweep(as.matrix(cluster_input), 2, feature_center, "-") %*%
  inverse_sqrt_covariance

colnames(whitened_features) <- paste0("z_", names(cluster_input))

customer_mahalanobis <- sqrt(rowSums(whitened_features^2))

summary(customer_mahalanobis)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.8343   1.9408   2.5011   3.1121   3.3474 113.8339

5 K-Means Clustering: 4 Segments

k <- 4

kmeans_fit <- kmeans(
  whitened_features,
  centers = k,
  nstart = 10,
  iter.max = 50
)

segmented_customers <- customer_features %>%
  mutate(
    cluster = factor(kmeans_fit$cluster),
    mahalanobis_distance = customer_mahalanobis,
    mahalanobis_distance_sq = mahalanobis_distance^2
  )

cluster_profiles <- segmented_customers %>%
  group_by(cluster) %>%
  summarise(
    customers = n(),
    share_of_customers = customers / nrow(segmented_customers),
    avg_orders = mean(orders),
    median_orders = median(orders),
    avg_total_spend = mean(total_spend),
    median_total_spend = median(total_spend),
    avg_order_value = mean(avg_order_value),
    avg_items_per_order = mean(avg_items_per_order),
    avg_review_score = mean(avg_review_score, na.rm = TRUE),
    avg_recency_days = mean(recency_days),
    avg_late_delivery_rate = mean(late_delivery_rate, na.rm = TRUE),
    avg_mahalanobis_distance = mean(mahalanobis_distance),
    max_mahalanobis_distance = max(mahalanobis_distance),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_mahalanobis_distance))

outlier_cluster <- cluster_profiles %>%
  slice_max(avg_mahalanobis_distance, n = 1, with_ties = FALSE) %>%
  pull(cluster)

cluster_profiles <- cluster_profiles %>%
  mutate(
    segment_type = if_else(cluster == outlier_cluster, "Outlier cluster", "Regular cluster")
  )

segmented_customers <- segmented_customers %>%
  mutate(
    segment_type = if_else(cluster == outlier_cluster, "Outlier cluster", "Regular cluster")
  )

cluster_profiles %>%
  mutate(
    share_of_customers = percent(share_of_customers, accuracy = 0.1),
    avg_total_spend = dollar(avg_total_spend, prefix = "R$"),
    median_total_spend = dollar(median_total_spend, prefix = "R$"),
    avg_order_value = dollar(avg_order_value, prefix = "R$")
  ) %>%
  kable()
cluster customers share_of_customers avg_orders median_orders avg_total_spend median_total_spend avg_order_value avg_items_per_order avg_review_score avg_recency_days avg_late_delivery_rate avg_mahalanobis_distance max_mahalanobis_distance segment_type
1 1255 1.3% 1.099602 1 R$272.81 R$214.15 R$253.68 2.356707 2.864210 204.00855 NaN 9.856549 30.60864 Outlier cluster
4 17968 19.2% 1.026046 1 R$175.33 R$112.13 R$171.23 1.215847 1.880065 Inf NaN 3.617042 33.54303 Regular cluster
3 17916 19.2% 1.032317 1 R$175.86 R$111.62 R$170.73 1.094312 4.604260 49.17388 NaN 3.287432 113.83391 Regular cluster
2 56257 60.2% 1.033951 1 R$156.06 R$104.19 R$151.42 1.102432 4.743042 291.79117 NaN 2.744586 73.69074 Regular cluster

6 Segment Visualization

pca_fit <- prcomp(whitened_features, center = FALSE, scale. = FALSE)

pca_scores <- as_tibble(pca_fit$x[, 1:2]) %>%
  bind_cols(segmented_customers %>% select(customer_unique_id, cluster, segment_type, mahalanobis_distance))

plot_sample_n <- min(15000, nrow(pca_scores))

pca_scores %>%
  slice_sample(n = plot_sample_n) %>%
  ggplot(aes(PC1, PC2, color = cluster, shape = segment_type)) +
  geom_point(alpha = 0.45, size = 1.5) +
  labs(
    title = "Four Customer Segments in Mahalanobis-Whitened PCA Space",
    subtitle = paste0("Cluster ", outlier_cluster, " is labeled as the outlier cluster"),
    x = "Principal component 1",
    y = "Principal component 2",
    color = "Cluster",
    shape = "Segment type"
  )

segmented_customers %>%
  ggplot(aes(cluster, mahalanobis_distance, fill = segment_type)) +
  geom_boxplot(outlier.alpha = 0.35) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Mahalanobis Distance by Customer Cluster",
    subtitle = "The outlier cluster has the largest average distance from the global customer center",
    x = "Cluster",
    y = "Mahalanobis distance",
    fill = "Segment type"
  )

cluster_profiles %>%
  mutate(cluster = fct_reorder(cluster, customers)) %>%
  ggplot(aes(customers, cluster, fill = segment_type)) +
  geom_col() +
  geom_text(aes(label = comma(customers)), hjust = -0.08, size = 3.4) +
  scale_x_continuous(labels = comma, expand = expansion(mult = c(0, 0.12))) +
  labs(
    title = "Customer Count by Segment",
    x = "Customers",
    y = "Cluster",
    fill = "Segment type"
  )

7 Segment Profiles

profile_long <- segmented_customers %>%
  select(
    cluster,
    segment_type,
    orders,
    total_items,
    total_spend,
    avg_order_value,
    avg_items_per_order,
    avg_review_score,
    avg_delivery_days,
    late_delivery_rate,
    active_days,
    recency_days,
    freight_share,
    mahalanobis_distance
  ) %>%
  group_by(cluster, segment_type) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE), .groups = "drop") %>%
  pivot_longer(
    cols = where(is.numeric),
    names_to = "feature",
    values_to = "value"
  ) %>%
  group_by(feature) %>%
  mutate(
    normalized_value = as.numeric(scale(value))
  ) %>%
  ungroup()

profile_long %>%
  ggplot(aes(feature, cluster, fill = normalized_value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "#28666e",
    mid = "white",
    high = "#9a4d3f",
    midpoint = 0
  ) +
  labs(
    title = "Normalized Customer Segment Profiles",
    subtitle = "Red means the cluster average is higher than the feature average; teal means lower",
    x = "Feature",
    y = "Cluster",
    fill = "Scaled average"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

8 Outlier Customers

The following table shows customers in the outlier cluster with the largest Mahalanobis distances.

segmented_customers %>%
  filter(cluster == outlier_cluster) %>%
  arrange(desc(mahalanobis_distance)) %>%
  select(
    customer_unique_id,
    customer_state,
    cluster,
    mahalanobis_distance,
    orders,
    total_items,
    total_spend,
    avg_order_value,
    avg_review_score,
    recency_days,
    late_delivery_rate
  ) %>%
  slice_head(n = 25) %>%
  mutate(
    total_spend = dollar(total_spend, prefix = "R$"),
    avg_order_value = dollar(avg_order_value, prefix = "R$"),
    late_delivery_rate = percent(late_delivery_rate, accuracy = 0.1)
  ) %>%
  kable()
customer_unique_id customer_state cluster mahalanobis_distance orders total_items total_spend avg_order_value avg_review_score recency_days late_delivery_rate
a3dc27d87234f862d7e61a0148c77fa9 SP 1 30.60864 3 11 R$628.63 R$209.54 5.000000 138.202396 NA
310647380793836bfa5b7b6b3f518423 SP 1 29.43438 3 10 R$385.27 R$128.42 5.000000 315.059595 NA
1da09dd64e235e7c2f29a4faff33535c RJ 1 29.37612 3 10 R$2,164.40 R$721.47 3.666667 230.155417 NA
d97b3cfb22b0d6b25ac9ed4e9c2d481b SP 1 28.36879 2 13 R$1,044.13 R$522.07 1.000000 316.079248 NA
a40096fc0a3862e9e12bc55b5f8e6ab2 RJ 1 27.26608 3 8 R$457.18 R$152.39 1.000000 322.007488 NA
efce1ab3e96ccab8b1b464326bd22417 SP 1 26.97205 2 11 R$904.72 R$452.36 5.000000 155.021921 NA
6f90ec5150be36c4475835b5941ab56f PR 1 26.43947 1 6 R$406.92 R$406.92 1.000000 688.683472 NA
70ad91901ad4330af432978ff134fc6f SP 1 25.37495 2 11 R$660.44 R$330.22 5.000000 162.931366 NA
b18c8cbd9616a95055ef4146f3f805e1 SP 1 23.68545 2 9 R$242.19 R$121.10 4.000000 85.980127 NA
8dc697d03f771cecc2534534a73eaaf9 MT 1 23.52040 1 6 R$217.18 R$217.18 1.000000 5.080926 NA
41a3b256cc497dc952a815b848345cbc RS 1 23.43077 1 7 R$421.69 R$421.69 1.000000 214.147280 NA
08a374bca4063116d5530a7b04ecaf3f SC 1 21.99691 2 7 R$342.77 R$171.38 5.000000 16.103252 NA
8bd9c53ae5e194fd363ed6714235e6be MG 1 21.47973 1 3 R$274.84 R$274.84 1.000000 240.078935 NA
e079b18794454de9d2be5c12b4392294 RJ 1 21.33996 2 6 R$437.78 R$218.89 5.000000 440.838981 NA
6dcf48fcc0c4d96dd7af8a0897ae9de8 SC 1 21.06965 2 7 R$447.28 R$223.64 3.000000 43.024144 NA
41b4af321ed477242aebe74652fd8219 SP 1 20.56987 2 7 R$254.23 R$127.12 5.000000 78.749526 NA
c8ed31310fc440a3f8031b177f9842c3 SP 1 20.45170 1 10 R$1,157.28 R$1,157.28 3.000000 17.534225 NA
ac95cdab584cac0e86ce97a5399ecf79 MG 1 20.11288 2 6 R$886.12 R$443.06 3.000000 143.097789 NA
76b61cd27ca2933a2bd2669762fb229a RS 1 19.44616 1 4 R$214.19 R$214.19 1.000000 214.945174 NA
07547425c8f7a7086f529f2d3b80d21a SP 1 19.44164 1 4 R$397.90 R$397.90 1.000000 77.960116 NA
d00cb93157046651ca3e21a3fc422413 RS 1 19.32545 1 4 R$547.29 R$547.29 1.000000 284.801724 NA
630331e73a79ae9a2ba821bd326ea298 SC 1 18.60385 2 5 R$233.07 R$116.54 5.000000 420.296100 NA
473d9165586cfc6708841d07adf6c020 SP 1 18.33637 2 5 R$645.98 R$322.99 4.500000 105.180637 NA
bf869f6a89c8ba217f47e22359f884f2 SP 1 18.27843 1 7 R$209.57 R$209.57 1.000000 13.438009 NA
ed4c785e11a78a73c48fa3af98008292 MG 1 18.25309 2 5 R$409.70 R$204.85 5.000000 102.017616 NA

9 Interpretation

outlier_profile <- cluster_profiles %>%
  filter(cluster == outlier_cluster)

largest_regular_cluster <- cluster_profiles %>%
  filter(cluster != outlier_cluster) %>%
  slice_max(customers, n = 1, with_ties = FALSE)

tibble(
  finding = c(
    "Number of clusters",
    "Outlier cluster",
    "Outlier-cluster size",
    "Outlier-cluster average Mahalanobis distance",
    "Largest regular cluster",
    "Largest regular-cluster size"
  ),
  value = c(
    k,
    as.character(outlier_cluster),
    paste0(comma(outlier_profile$customers), " customers"),
    comma(outlier_profile$avg_mahalanobis_distance, accuracy = 0.01),
    as.character(largest_regular_cluster$cluster),
    paste0(comma(largest_regular_cluster$customers), " customers")
  )
) %>%
  kable()
finding value
Number of clusters 4
Outlier cluster 1
Outlier-cluster size 1,255 customers
Outlier-cluster average Mahalanobis distance 9.86
Largest regular cluster 2
Largest regular-cluster size 56,257 customers

The outlier cluster is not defined as “bad” customers. It is the group whose purchasing behavior is most statistically unusual after accounting for relationships among the features. These customers may be unusually high spenders, unusually frequent buyers, customers with large order values, customers with unusual delivery or review patterns, or a combination of these traits.