1 Objective
2 Load Data
3 Build Customer-Level Features
4 Mahalanobis Feature Transformation
5 K-Means Clustering: 4 Segments
6 Segment Visualization
7 Segment Profiles
8 Outlier Customers
9 Interpretation

knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width = 10,
  fig.height = 6
)

# Install once if needed:
# install.packages(c("tidyverse", "janitor", "lubridate", "scales", "knitr"))

library(tidyverse)
library(janitor)
library(lubridate)
library(scales)
library(knitr)

theme_set(theme_minimal(base_size = 12))
set.seed(123)

1 Objective

This report creates four customer segments using k-means clustering with Mahalanobis distance logic.

Standard k-means minimizes Euclidean distance. To use Mahalanobis distance, the numeric customer feature matrix is whitened with the inverse square root of its covariance matrix. Euclidean k-means on the whitened matrix is equivalent to k-means in the original feature space using Mahalanobis distance.

The cluster with the largest average Mahalanobis distance from the global customer center is interpreted as the outlier cluster.

2 Load Data

customers <- read_csv("olist_customers_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

orders <- read_csv("olist_orders_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

order_items <- read_csv("olist_order_items_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

payments <- read_csv("olist_order_payments_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

reviews <- read_csv("olist_order_reviews_dataset.csv", show_col_types = FALSE) %>%
  clean_names()

list(
  customers = dim(customers),
  orders = dim(orders),
  order_items = dim(order_items),
  payments = dim(payments),
  reviews = dim(reviews)
)

## $customers
## [1] 99441     5
## 
## $orders
## [1] 99441     8
## 
## $order_items
## [1] 112650      7
## 
## $payments
## [1] 103886      5
## 
## $reviews
## [1] 100000      7

3 Build Customer-Level Features

orders_clean <- orders %>%
  mutate(
    order_purchase_timestamp = ymd_hms(order_purchase_timestamp),
    order_delivered_customer_date = ymd_hms(order_delivered_customer_date),
    order_estimated_delivery_date = ymd_hms(order_estimated_delivery_date),
    delivered_late = case_when(
      !is.na(order_delivered_customer_date) & !is.na(order_estimated_delivery_date) ~
        as.numeric(order_delivered_customer_date > order_estimated_delivery_date),
      TRUE ~ NA_real_
    ),
    delivery_days = as.numeric(
      difftime(order_delivered_customer_date, order_purchase_timestamp, units = "days")
    )
  )

item_order_summary <- order_items %>%
  group_by(order_id) %>%
  summarise(
    items = n(),
    item_spend = sum(price, na.rm = TRUE),
    freight_spend = sum(freight_value, na.rm = TRUE),
    sellers = n_distinct(seller_id),
    products = n_distinct(product_id),
    .groups = "drop"
  )

payment_order_summary <- payments %>%
  group_by(order_id) %>%
  summarise(
    payment_value = sum(payment_value, na.rm = TRUE),
    avg_installments = mean(payment_installments, na.rm = TRUE),
    payment_methods = n_distinct(payment_type),
    .groups = "drop"
  )

review_order_summary <- reviews %>%
  group_by(order_id) %>%
  summarise(
    review_score = mean(review_score, na.rm = TRUE),
    .groups = "drop"
  )

order_analysis <- orders_clean %>%
  left_join(customers, by = "customer_id") %>%
  left_join(item_order_summary, by = "order_id") %>%
  left_join(payment_order_summary, by = "order_id") %>%
  left_join(review_order_summary, by = "order_id") %>%
  filter(order_status == "delivered") %>%
  mutate(
    items = replace_na(items, 0),
    item_spend = replace_na(item_spend, 0),
    freight_spend = replace_na(freight_spend, 0),
    payment_value = coalesce(payment_value, item_spend + freight_spend),
    sellers = replace_na(sellers, 0),
    products = replace_na(products, 0),
    payment_methods = replace_na(payment_methods, 0)
  )

analysis_end_date <- max(order_analysis$order_purchase_timestamp, na.rm = TRUE)

customer_features <- order_analysis %>%
  group_by(customer_unique_id, customer_state) %>%
  summarise(
    orders = n_distinct(order_id),
    total_items = sum(items, na.rm = TRUE),
    total_products = sum(products, na.rm = TRUE),
    total_spend = sum(payment_value, na.rm = TRUE),
    item_spend = sum(item_spend, na.rm = TRUE),
    freight_spend = sum(freight_spend, na.rm = TRUE),
    avg_order_value = mean(payment_value, na.rm = TRUE),
    avg_items_per_order = mean(items, na.rm = TRUE),
    avg_sellers_per_order = mean(sellers, na.rm = TRUE),
    avg_installments = mean(avg_installments, na.rm = TRUE),
    avg_review_score = mean(review_score, na.rm = TRUE),
    avg_delivery_days = mean(delivery_days, na.rm = TRUE),
    late_delivery_rate = mean(delivered_late, na.rm = TRUE),
    first_purchase = min(order_purchase_timestamp, na.rm = TRUE),
    last_purchase = max(order_purchase_timestamp, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    active_days = pmax(
      as.numeric(difftime(last_purchase, first_purchase, units = "days")),
      0
    ),
    recency_days = as.numeric(difftime(analysis_end_date, last_purchase, units = "days")),
    freight_share = freight_spend / pmax(total_spend, 1)
  )

customer_features %>%
  select(
    customer_unique_id,
    customer_state,
    orders,
    total_items,
    total_spend,
    avg_order_value,
    avg_review_score,
    recency_days
  ) %>%
  slice_head(n = 10) %>%
  kable()

customer_unique_id	customer_state	orders	total_items	total_spend	avg_order_value	avg_review_score	recency_days
0000366f3b9a7992bf8c76cfdf3221e2	SP	1	1	141.90	141.90	5	111.1696
0000b849f77a49e4a4ce2b2a4ca5be3f	SP	1	1	27.19	27.19	4	114.1591
0000f46a3911fa3c0805444483337064	SC	1	1	86.22	86.22	3	536.7469
0000f6ccb0745a6a4b88665a16c9f078	PA	1	1	43.62	43.62	4	320.7715
0004aac84e0df4da2b147fca70cf8255	SP	1	1	196.89	196.89	5	287.8020
0004bd2a26a76fe21f786e4fbd80607f	SP	1	1	166.98	166.98	4	145.8107
00050ab1314c0e55a6ca13cf7181fecf	SP	1	1	35.38	35.38	4	131.0856
00053a61a98854899e70ed204dd4bafe	PR	1	2	419.18	419.18	1	182.1562
0005e1862207bf6ccc02e4228effd9a0	RJ	1	1	150.12	150.12	4	542.6447
0005ef4cd20d2893f0d9fbd94d3c0d97	MA	1	1	129.76	129.76	1	169.9850

4 Mahalanobis Feature Transformation

The features below are transformed before clustering. Highly skewed monetary and count variables use log1p. Missing or infinite values are replaced with medians.

replace_bad_numeric <- function(x) {
  x[is.nan(x) | is.infinite(x)] <- NA_real_
  fallback <- median(x, na.rm = TRUE)
  if (is.na(fallback) || is.nan(fallback) || is.infinite(fallback)) {
    fallback <- 0
  }
  replace_na(x, fallback)
}

cluster_input <- customer_features %>%
  select(
    orders,
    total_items,
    total_products,
    total_spend,
    item_spend,
    freight_spend,
    avg_order_value,
    avg_items_per_order,
    avg_sellers_per_order,
    avg_installments,
    avg_review_score,
    avg_delivery_days,
    late_delivery_rate,
    active_days,
    recency_days,
    freight_share
  ) %>%
  mutate(
    across(
      c(
        orders,
        total_items,
        total_products,
        total_spend,
        item_spend,
        freight_spend,
        avg_order_value,
        avg_items_per_order,
        avg_sellers_per_order,
        avg_installments,
        avg_delivery_days,
        active_days,
        recency_days
      ),
      log1p
    ),
    across(everything(), replace_bad_numeric)
  )

feature_center <- colMeans(cluster_input)
feature_covariance <- cov(cluster_input)

# A small ridge term keeps the covariance matrix invertible when features are correlated.
ridge <- 1e-6
regularized_covariance <- feature_covariance +
  diag(ridge, ncol(feature_covariance))

eig <- eigen(regularized_covariance)
inverse_sqrt_covariance <- eig$vectors %*%
  diag(1 / sqrt(pmax(eig$values, ridge))) %*%
  t(eig$vectors)

whitened_features <- sweep(as.matrix(cluster_input), 2, feature_center, "-") %*%
  inverse_sqrt_covariance

colnames(whitened_features) <- paste0("z_", names(cluster_input))

customer_mahalanobis <- sqrt(rowSums(whitened_features^2))

summary(customer_mahalanobis)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.8343   1.9408   2.5011   3.1121   3.3474 113.8339

5 K-Means Clustering: 4 Segments

k <- 4

kmeans_fit <- kmeans(
  whitened_features,
  centers = k,
  nstart = 10,
  iter.max = 50
)

segmented_customers <- customer_features %>%
  mutate(
    cluster = factor(kmeans_fit$cluster),
    mahalanobis_distance = customer_mahalanobis,
    mahalanobis_distance_sq = mahalanobis_distance^2
  )

cluster_profiles <- segmented_customers %>%
  group_by(cluster) %>%
  summarise(
    customers = n(),
    share_of_customers = customers / nrow(segmented_customers),
    avg_orders = mean(orders),
    median_orders = median(orders),
    avg_total_spend = mean(total_spend),
    median_total_spend = median(total_spend),
    avg_order_value = mean(avg_order_value),
    avg_items_per_order = mean(avg_items_per_order),
    avg_review_score = mean(avg_review_score, na.rm = TRUE),
    avg_recency_days = mean(recency_days),
    avg_late_delivery_rate = mean(late_delivery_rate, na.rm = TRUE),
    avg_mahalanobis_distance = mean(mahalanobis_distance),
    max_mahalanobis_distance = max(mahalanobis_distance),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_mahalanobis_distance))

outlier_cluster <- cluster_profiles %>%
  slice_max(avg_mahalanobis_distance, n = 1, with_ties = FALSE) %>%
  pull(cluster)

cluster_profiles <- cluster_profiles %>%
  mutate(
    segment_type = if_else(cluster == outlier_cluster, "Outlier cluster", "Regular cluster")
  )

segmented_customers <- segmented_customers %>%
  mutate(
    segment_type = if_else(cluster == outlier_cluster, "Outlier cluster", "Regular cluster")
  )

cluster_profiles %>%
  mutate(
    share_of_customers = percent(share_of_customers, accuracy = 0.1),
    avg_total_spend = dollar(avg_total_spend, prefix = "R$"),
    median_total_spend = dollar(median_total_spend, prefix = "R$"),
    avg_order_value = dollar(avg_order_value, prefix = "R$")
  ) %>%
  kable()

cluster	customers	share_of_customers	avg_orders	median_orders	avg_total_spend	median_total_spend	avg_order_value	avg_items_per_order	avg_review_score	avg_recency_days	avg_late_delivery_rate	avg_mahalanobis_distance	max_mahalanobis_distance	segment_type
1	1255	1.3%	1.099602	1	R$272.81	R$214.15	R$253.68	2.356707	2.864210	204.00855	NaN	9.856549	30.60864	Outlier cluster
4	17968	19.2%	1.026046	1	R$175.33	R$112.13	R$171.23	1.215847	1.880065	Inf	NaN	3.617042	33.54303	Regular cluster
3	17916	19.2%	1.032317	1	R$175.86	R$111.62	R$170.73	1.094312	4.604260	49.17388	NaN	3.287432	113.83391	Regular cluster
2	56257	60.2%	1.033951	1	R$156.06	R$104.19	R$151.42	1.102432	4.743042	291.79117	NaN	2.744586	73.69074	Regular cluster

6 Segment Visualization

pca_fit <- prcomp(whitened_features, center = FALSE, scale. = FALSE)

pca_scores <- as_tibble(pca_fit$x[, 1:2]) %>%
  bind_cols(segmented_customers %>% select(customer_unique_id, cluster, segment_type, mahalanobis_distance))

plot_sample_n <- min(15000, nrow(pca_scores))

pca_scores %>%
  slice_sample(n = plot_sample_n) %>%
  ggplot(aes(PC1, PC2, color = cluster, shape = segment_type)) +
  geom_point(alpha = 0.45, size = 1.5) +
  labs(
    title = "Four Customer Segments in Mahalanobis-Whitened PCA Space",
    subtitle = paste0("Cluster ", outlier_cluster, " is labeled as the outlier cluster"),
    x = "Principal component 1",
    y = "Principal component 2",
    color = "Cluster",
    shape = "Segment type"
  )

segmented_customers %>%
  ggplot(aes(cluster, mahalanobis_distance, fill = segment_type)) +
  geom_boxplot(outlier.alpha = 0.35) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Mahalanobis Distance by Customer Cluster",
    subtitle = "The outlier cluster has the largest average distance from the global customer center",
    x = "Cluster",
    y = "Mahalanobis distance",
    fill = "Segment type"
  )

cluster_profiles %>%
  mutate(cluster = fct_reorder(cluster, customers)) %>%
  ggplot(aes(customers, cluster, fill = segment_type)) +
  geom_col() +
  geom_text(aes(label = comma(customers)), hjust = -0.08, size = 3.4) +
  scale_x_continuous(labels = comma, expand = expansion(mult = c(0, 0.12))) +
  labs(
    title = "Customer Count by Segment",
    x = "Customers",
    y = "Cluster",
    fill = "Segment type"
  )

7 Segment Profiles

profile_long <- segmented_customers %>%
  select(
    cluster,
    segment_type,
    orders,
    total_items,
    total_spend,
    avg_order_value,
    avg_items_per_order,
    avg_review_score,
    avg_delivery_days,
    late_delivery_rate,
    active_days,
    recency_days,
    freight_share,
    mahalanobis_distance
  ) %>%
  group_by(cluster, segment_type) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE), .groups = "drop") %>%
  pivot_longer(
    cols = where(is.numeric),
    names_to = "feature",
    values_to = "value"
  ) %>%
  group_by(feature) %>%
  mutate(
    normalized_value = as.numeric(scale(value))
  ) %>%
  ungroup()

profile_long %>%
  ggplot(aes(feature, cluster, fill = normalized_value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "#28666e",
    mid = "white",
    high = "#9a4d3f",
    midpoint = 0
  ) +
  labs(
    title = "Normalized Customer Segment Profiles",
    subtitle = "Red means the cluster average is higher than the feature average; teal means lower",
    x = "Feature",
    y = "Cluster",
    fill = "Scaled average"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

8 Outlier Customers

The following table shows customers in the outlier cluster with the largest Mahalanobis distances.

segmented_customers %>%
  filter(cluster == outlier_cluster) %>%
  arrange(desc(mahalanobis_distance)) %>%
  select(
    customer_unique_id,
    customer_state,
    cluster,
    mahalanobis_distance,
    orders,
    total_items,
    total_spend,
    avg_order_value,
    avg_review_score,
    recency_days,
    late_delivery_rate
  ) %>%
  slice_head(n = 25) %>%
  mutate(
    total_spend = dollar(total_spend, prefix = "R$"),
    avg_order_value = dollar(avg_order_value, prefix = "R$"),
    late_delivery_rate = percent(late_delivery_rate, accuracy = 0.1)
  ) %>%
  kable()

customer_unique_id	customer_state	cluster	mahalanobis_distance	orders	total_items	total_spend	avg_order_value	avg_review_score	recency_days	late_delivery_rate
a3dc27d87234f862d7e61a0148c77fa9	SP	1	30.60864	3	11	R$628.63	R$209.54	5.000000	138.202396	NA
310647380793836bfa5b7b6b3f518423	SP	1	29.43438	3	10	R$385.27	R$128.42	5.000000	315.059595	NA
1da09dd64e235e7c2f29a4faff33535c	RJ	1	29.37612	3	10	R$2,164.40	R$721.47	3.666667	230.155417	NA
d97b3cfb22b0d6b25ac9ed4e9c2d481b	SP	1	28.36879	2	13	R$1,044.13	R$522.07	1.000000	316.079248	NA
a40096fc0a3862e9e12bc55b5f8e6ab2	RJ	1	27.26608	3	8	R$457.18	R$152.39	1.000000	322.007488	NA
efce1ab3e96ccab8b1b464326bd22417	SP	1	26.97205	2	11	R$904.72	R$452.36	5.000000	155.021921	NA
6f90ec5150be36c4475835b5941ab56f	PR	1	26.43947	1	6	R$406.92	R$406.92	1.000000	688.683472	NA
70ad91901ad4330af432978ff134fc6f	SP	1	25.37495	2	11	R$660.44	R$330.22	5.000000	162.931366	NA
b18c8cbd9616a95055ef4146f3f805e1	SP	1	23.68545	2	9	R$242.19	R$121.10	4.000000	85.980127	NA
8dc697d03f771cecc2534534a73eaaf9	MT	1	23.52040	1	6	R$217.18	R$217.18	1.000000	5.080926	NA
41a3b256cc497dc952a815b848345cbc	RS	1	23.43077	1	7	R$421.69	R$421.69	1.000000	214.147280	NA
08a374bca4063116d5530a7b04ecaf3f	SC	1	21.99691	2	7	R$342.77	R$171.38	5.000000	16.103252	NA
8bd9c53ae5e194fd363ed6714235e6be	MG	1	21.47973	1	3	R$274.84	R$274.84	1.000000	240.078935	NA
e079b18794454de9d2be5c12b4392294	RJ	1	21.33996	2	6	R$437.78	R$218.89	5.000000	440.838981	NA
6dcf48fcc0c4d96dd7af8a0897ae9de8	SC	1	21.06965	2	7	R$447.28	R$223.64	3.000000	43.024144	NA
41b4af321ed477242aebe74652fd8219	SP	1	20.56987	2	7	R$254.23	R$127.12	5.000000	78.749526	NA
c8ed31310fc440a3f8031b177f9842c3	SP	1	20.45170	1	10	R$1,157.28	R$1,157.28	3.000000	17.534225	NA
ac95cdab584cac0e86ce97a5399ecf79	MG	1	20.11288	2	6	R$886.12	R$443.06	3.000000	143.097789	NA
76b61cd27ca2933a2bd2669762fb229a	RS	1	19.44616	1	4	R$214.19	R$214.19	1.000000	214.945174	NA
07547425c8f7a7086f529f2d3b80d21a	SP	1	19.44164	1	4	R$397.90	R$397.90	1.000000	77.960116	NA
d00cb93157046651ca3e21a3fc422413	RS	1	19.32545	1	4	R$547.29	R$547.29	1.000000	284.801724	NA
630331e73a79ae9a2ba821bd326ea298	SC	1	18.60385	2	5	R$233.07	R$116.54	5.000000	420.296100	NA
473d9165586cfc6708841d07adf6c020	SP	1	18.33637	2	5	R$645.98	R$322.99	4.500000	105.180637	NA
bf869f6a89c8ba217f47e22359f884f2	SP	1	18.27843	1	7	R$209.57	R$209.57	1.000000	13.438009	NA
ed4c785e11a78a73c48fa3af98008292	MG	1	18.25309	2	5	R$409.70	R$204.85	5.000000	102.017616	NA

9 Interpretation

outlier_profile <- cluster_profiles %>%
  filter(cluster == outlier_cluster)

largest_regular_cluster <- cluster_profiles %>%
  filter(cluster != outlier_cluster) %>%
  slice_max(customers, n = 1, with_ties = FALSE)

tibble(
  finding = c(
    "Number of clusters",
    "Outlier cluster",
    "Outlier-cluster size",
    "Outlier-cluster average Mahalanobis distance",
    "Largest regular cluster",
    "Largest regular-cluster size"
  ),
  value = c(
    k,
    as.character(outlier_cluster),
    paste0(comma(outlier_profile$customers), " customers"),
    comma(outlier_profile$avg_mahalanobis_distance, accuracy = 0.01),
    as.character(largest_regular_cluster$cluster),
    paste0(comma(largest_regular_cluster$customers), " customers")
  )
) %>%
  kable()

finding	value
Number of clusters	4
Outlier cluster	1
Outlier-cluster size	1,255 customers
Outlier-cluster average Mahalanobis distance	9.86
Largest regular cluster	2
Largest regular-cluster size	56,257 customers

The outlier cluster is not defined as “bad” customers. It is the group whose purchasing behavior is most statistically unusual after accounting for relationships among the features. These customers may be unusually high spenders, unusually frequent buyers, customers with large order values, customers with unusual delivery or review patterns, or a combination of these traits.

Customer Segmentation With Mahalanobis K-Means

Olist Customer Segmentation

2026-06-09