knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE,
fig.width = 10,
fig.height = 6
)
# Install once if needed:
# install.packages(c("tidyverse", "janitor", "lubridate", "scales", "knitr"))
library(tidyverse)
library(janitor)
library(lubridate)
library(scales)
library(knitr)
theme_set(theme_minimal(base_size = 12))
set.seed(123)
This report creates four customer segments using k-means clustering with Mahalanobis distance logic.
Standard k-means minimizes Euclidean distance. To use Mahalanobis distance, the numeric customer feature matrix is whitened with the inverse square root of its covariance matrix. Euclidean k-means on the whitened matrix is equivalent to k-means in the original feature space using Mahalanobis distance.
The cluster with the largest average Mahalanobis distance from the global customer center is interpreted as the outlier cluster.
customers <- read_csv("olist_customers_dataset.csv", show_col_types = FALSE) %>%
clean_names()
orders <- read_csv("olist_orders_dataset.csv", show_col_types = FALSE) %>%
clean_names()
order_items <- read_csv("olist_order_items_dataset.csv", show_col_types = FALSE) %>%
clean_names()
payments <- read_csv("olist_order_payments_dataset.csv", show_col_types = FALSE) %>%
clean_names()
reviews <- read_csv("olist_order_reviews_dataset.csv", show_col_types = FALSE) %>%
clean_names()
list(
customers = dim(customers),
orders = dim(orders),
order_items = dim(order_items),
payments = dim(payments),
reviews = dim(reviews)
)
## $customers
## [1] 99441 5
##
## $orders
## [1] 99441 8
##
## $order_items
## [1] 112650 7
##
## $payments
## [1] 103886 5
##
## $reviews
## [1] 100000 7
orders_clean <- orders %>%
mutate(
order_purchase_timestamp = ymd_hms(order_purchase_timestamp),
order_delivered_customer_date = ymd_hms(order_delivered_customer_date),
order_estimated_delivery_date = ymd_hms(order_estimated_delivery_date),
delivered_late = case_when(
!is.na(order_delivered_customer_date) & !is.na(order_estimated_delivery_date) ~
as.numeric(order_delivered_customer_date > order_estimated_delivery_date),
TRUE ~ NA_real_
),
delivery_days = as.numeric(
difftime(order_delivered_customer_date, order_purchase_timestamp, units = "days")
)
)
item_order_summary <- order_items %>%
group_by(order_id) %>%
summarise(
items = n(),
item_spend = sum(price, na.rm = TRUE),
freight_spend = sum(freight_value, na.rm = TRUE),
sellers = n_distinct(seller_id),
products = n_distinct(product_id),
.groups = "drop"
)
payment_order_summary <- payments %>%
group_by(order_id) %>%
summarise(
payment_value = sum(payment_value, na.rm = TRUE),
avg_installments = mean(payment_installments, na.rm = TRUE),
payment_methods = n_distinct(payment_type),
.groups = "drop"
)
review_order_summary <- reviews %>%
group_by(order_id) %>%
summarise(
review_score = mean(review_score, na.rm = TRUE),
.groups = "drop"
)
order_analysis <- orders_clean %>%
left_join(customers, by = "customer_id") %>%
left_join(item_order_summary, by = "order_id") %>%
left_join(payment_order_summary, by = "order_id") %>%
left_join(review_order_summary, by = "order_id") %>%
filter(order_status == "delivered") %>%
mutate(
items = replace_na(items, 0),
item_spend = replace_na(item_spend, 0),
freight_spend = replace_na(freight_spend, 0),
payment_value = coalesce(payment_value, item_spend + freight_spend),
sellers = replace_na(sellers, 0),
products = replace_na(products, 0),
payment_methods = replace_na(payment_methods, 0)
)
analysis_end_date <- max(order_analysis$order_purchase_timestamp, na.rm = TRUE)
customer_features <- order_analysis %>%
group_by(customer_unique_id, customer_state) %>%
summarise(
orders = n_distinct(order_id),
total_items = sum(items, na.rm = TRUE),
total_products = sum(products, na.rm = TRUE),
total_spend = sum(payment_value, na.rm = TRUE),
item_spend = sum(item_spend, na.rm = TRUE),
freight_spend = sum(freight_spend, na.rm = TRUE),
avg_order_value = mean(payment_value, na.rm = TRUE),
avg_items_per_order = mean(items, na.rm = TRUE),
avg_sellers_per_order = mean(sellers, na.rm = TRUE),
avg_installments = mean(avg_installments, na.rm = TRUE),
avg_review_score = mean(review_score, na.rm = TRUE),
avg_delivery_days = mean(delivery_days, na.rm = TRUE),
late_delivery_rate = mean(delivered_late, na.rm = TRUE),
first_purchase = min(order_purchase_timestamp, na.rm = TRUE),
last_purchase = max(order_purchase_timestamp, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
active_days = pmax(
as.numeric(difftime(last_purchase, first_purchase, units = "days")),
0
),
recency_days = as.numeric(difftime(analysis_end_date, last_purchase, units = "days")),
freight_share = freight_spend / pmax(total_spend, 1)
)
customer_features %>%
select(
customer_unique_id,
customer_state,
orders,
total_items,
total_spend,
avg_order_value,
avg_review_score,
recency_days
) %>%
slice_head(n = 10) %>%
kable()
| customer_unique_id | customer_state | orders | total_items | total_spend | avg_order_value | avg_review_score | recency_days |
|---|---|---|---|---|---|---|---|
| 0000366f3b9a7992bf8c76cfdf3221e2 | SP | 1 | 1 | 141.90 | 141.90 | 5 | 111.1696 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | SP | 1 | 1 | 27.19 | 27.19 | 4 | 114.1591 |
| 0000f46a3911fa3c0805444483337064 | SC | 1 | 1 | 86.22 | 86.22 | 3 | 536.7469 |
| 0000f6ccb0745a6a4b88665a16c9f078 | PA | 1 | 1 | 43.62 | 43.62 | 4 | 320.7715 |
| 0004aac84e0df4da2b147fca70cf8255 | SP | 1 | 1 | 196.89 | 196.89 | 5 | 287.8020 |
| 0004bd2a26a76fe21f786e4fbd80607f | SP | 1 | 1 | 166.98 | 166.98 | 4 | 145.8107 |
| 00050ab1314c0e55a6ca13cf7181fecf | SP | 1 | 1 | 35.38 | 35.38 | 4 | 131.0856 |
| 00053a61a98854899e70ed204dd4bafe | PR | 1 | 2 | 419.18 | 419.18 | 1 | 182.1562 |
| 0005e1862207bf6ccc02e4228effd9a0 | RJ | 1 | 1 | 150.12 | 150.12 | 4 | 542.6447 |
| 0005ef4cd20d2893f0d9fbd94d3c0d97 | MA | 1 | 1 | 129.76 | 129.76 | 1 | 169.9850 |
The features below are transformed before clustering. Highly skewed
monetary and count variables use log1p. Missing or infinite
values are replaced with medians.
replace_bad_numeric <- function(x) {
x[is.nan(x) | is.infinite(x)] <- NA_real_
fallback <- median(x, na.rm = TRUE)
if (is.na(fallback) || is.nan(fallback) || is.infinite(fallback)) {
fallback <- 0
}
replace_na(x, fallback)
}
cluster_input <- customer_features %>%
select(
orders,
total_items,
total_products,
total_spend,
item_spend,
freight_spend,
avg_order_value,
avg_items_per_order,
avg_sellers_per_order,
avg_installments,
avg_review_score,
avg_delivery_days,
late_delivery_rate,
active_days,
recency_days,
freight_share
) %>%
mutate(
across(
c(
orders,
total_items,
total_products,
total_spend,
item_spend,
freight_spend,
avg_order_value,
avg_items_per_order,
avg_sellers_per_order,
avg_installments,
avg_delivery_days,
active_days,
recency_days
),
log1p
),
across(everything(), replace_bad_numeric)
)
feature_center <- colMeans(cluster_input)
feature_covariance <- cov(cluster_input)
# A small ridge term keeps the covariance matrix invertible when features are correlated.
ridge <- 1e-6
regularized_covariance <- feature_covariance +
diag(ridge, ncol(feature_covariance))
eig <- eigen(regularized_covariance)
inverse_sqrt_covariance <- eig$vectors %*%
diag(1 / sqrt(pmax(eig$values, ridge))) %*%
t(eig$vectors)
whitened_features <- sweep(as.matrix(cluster_input), 2, feature_center, "-") %*%
inverse_sqrt_covariance
colnames(whitened_features) <- paste0("z_", names(cluster_input))
customer_mahalanobis <- sqrt(rowSums(whitened_features^2))
summary(customer_mahalanobis)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8343 1.9408 2.5011 3.1121 3.3474 113.8339
k <- 4
kmeans_fit <- kmeans(
whitened_features,
centers = k,
nstart = 10,
iter.max = 50
)
segmented_customers <- customer_features %>%
mutate(
cluster = factor(kmeans_fit$cluster),
mahalanobis_distance = customer_mahalanobis,
mahalanobis_distance_sq = mahalanobis_distance^2
)
cluster_profiles <- segmented_customers %>%
group_by(cluster) %>%
summarise(
customers = n(),
share_of_customers = customers / nrow(segmented_customers),
avg_orders = mean(orders),
median_orders = median(orders),
avg_total_spend = mean(total_spend),
median_total_spend = median(total_spend),
avg_order_value = mean(avg_order_value),
avg_items_per_order = mean(avg_items_per_order),
avg_review_score = mean(avg_review_score, na.rm = TRUE),
avg_recency_days = mean(recency_days),
avg_late_delivery_rate = mean(late_delivery_rate, na.rm = TRUE),
avg_mahalanobis_distance = mean(mahalanobis_distance),
max_mahalanobis_distance = max(mahalanobis_distance),
.groups = "drop"
) %>%
arrange(desc(avg_mahalanobis_distance))
outlier_cluster <- cluster_profiles %>%
slice_max(avg_mahalanobis_distance, n = 1, with_ties = FALSE) %>%
pull(cluster)
cluster_profiles <- cluster_profiles %>%
mutate(
segment_type = if_else(cluster == outlier_cluster, "Outlier cluster", "Regular cluster")
)
segmented_customers <- segmented_customers %>%
mutate(
segment_type = if_else(cluster == outlier_cluster, "Outlier cluster", "Regular cluster")
)
cluster_profiles %>%
mutate(
share_of_customers = percent(share_of_customers, accuracy = 0.1),
avg_total_spend = dollar(avg_total_spend, prefix = "R$"),
median_total_spend = dollar(median_total_spend, prefix = "R$"),
avg_order_value = dollar(avg_order_value, prefix = "R$")
) %>%
kable()
| cluster | customers | share_of_customers | avg_orders | median_orders | avg_total_spend | median_total_spend | avg_order_value | avg_items_per_order | avg_review_score | avg_recency_days | avg_late_delivery_rate | avg_mahalanobis_distance | max_mahalanobis_distance | segment_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1255 | 1.3% | 1.099602 | 1 | R$272.81 | R$214.15 | R$253.68 | 2.356707 | 2.864210 | 204.00855 | NaN | 9.856549 | 30.60864 | Outlier cluster |
| 4 | 17968 | 19.2% | 1.026046 | 1 | R$175.33 | R$112.13 | R$171.23 | 1.215847 | 1.880065 | Inf | NaN | 3.617042 | 33.54303 | Regular cluster |
| 3 | 17916 | 19.2% | 1.032317 | 1 | R$175.86 | R$111.62 | R$170.73 | 1.094312 | 4.604260 | 49.17388 | NaN | 3.287432 | 113.83391 | Regular cluster |
| 2 | 56257 | 60.2% | 1.033951 | 1 | R$156.06 | R$104.19 | R$151.42 | 1.102432 | 4.743042 | 291.79117 | NaN | 2.744586 | 73.69074 | Regular cluster |
pca_fit <- prcomp(whitened_features, center = FALSE, scale. = FALSE)
pca_scores <- as_tibble(pca_fit$x[, 1:2]) %>%
bind_cols(segmented_customers %>% select(customer_unique_id, cluster, segment_type, mahalanobis_distance))
plot_sample_n <- min(15000, nrow(pca_scores))
pca_scores %>%
slice_sample(n = plot_sample_n) %>%
ggplot(aes(PC1, PC2, color = cluster, shape = segment_type)) +
geom_point(alpha = 0.45, size = 1.5) +
labs(
title = "Four Customer Segments in Mahalanobis-Whitened PCA Space",
subtitle = paste0("Cluster ", outlier_cluster, " is labeled as the outlier cluster"),
x = "Principal component 1",
y = "Principal component 2",
color = "Cluster",
shape = "Segment type"
)
segmented_customers %>%
ggplot(aes(cluster, mahalanobis_distance, fill = segment_type)) +
geom_boxplot(outlier.alpha = 0.35) +
scale_y_continuous(labels = comma) +
labs(
title = "Mahalanobis Distance by Customer Cluster",
subtitle = "The outlier cluster has the largest average distance from the global customer center",
x = "Cluster",
y = "Mahalanobis distance",
fill = "Segment type"
)
cluster_profiles %>%
mutate(cluster = fct_reorder(cluster, customers)) %>%
ggplot(aes(customers, cluster, fill = segment_type)) +
geom_col() +
geom_text(aes(label = comma(customers)), hjust = -0.08, size = 3.4) +
scale_x_continuous(labels = comma, expand = expansion(mult = c(0, 0.12))) +
labs(
title = "Customer Count by Segment",
x = "Customers",
y = "Cluster",
fill = "Segment type"
)
profile_long <- segmented_customers %>%
select(
cluster,
segment_type,
orders,
total_items,
total_spend,
avg_order_value,
avg_items_per_order,
avg_review_score,
avg_delivery_days,
late_delivery_rate,
active_days,
recency_days,
freight_share,
mahalanobis_distance
) %>%
group_by(cluster, segment_type) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE), .groups = "drop") %>%
pivot_longer(
cols = where(is.numeric),
names_to = "feature",
values_to = "value"
) %>%
group_by(feature) %>%
mutate(
normalized_value = as.numeric(scale(value))
) %>%
ungroup()
profile_long %>%
ggplot(aes(feature, cluster, fill = normalized_value)) +
geom_tile(color = "white") +
scale_fill_gradient2(
low = "#28666e",
mid = "white",
high = "#9a4d3f",
midpoint = 0
) +
labs(
title = "Normalized Customer Segment Profiles",
subtitle = "Red means the cluster average is higher than the feature average; teal means lower",
x = "Feature",
y = "Cluster",
fill = "Scaled average"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The following table shows customers in the outlier cluster with the largest Mahalanobis distances.
segmented_customers %>%
filter(cluster == outlier_cluster) %>%
arrange(desc(mahalanobis_distance)) %>%
select(
customer_unique_id,
customer_state,
cluster,
mahalanobis_distance,
orders,
total_items,
total_spend,
avg_order_value,
avg_review_score,
recency_days,
late_delivery_rate
) %>%
slice_head(n = 25) %>%
mutate(
total_spend = dollar(total_spend, prefix = "R$"),
avg_order_value = dollar(avg_order_value, prefix = "R$"),
late_delivery_rate = percent(late_delivery_rate, accuracy = 0.1)
) %>%
kable()
| customer_unique_id | customer_state | cluster | mahalanobis_distance | orders | total_items | total_spend | avg_order_value | avg_review_score | recency_days | late_delivery_rate |
|---|---|---|---|---|---|---|---|---|---|---|
| a3dc27d87234f862d7e61a0148c77fa9 | SP | 1 | 30.60864 | 3 | 11 | R$628.63 | R$209.54 | 5.000000 | 138.202396 | NA |
| 310647380793836bfa5b7b6b3f518423 | SP | 1 | 29.43438 | 3 | 10 | R$385.27 | R$128.42 | 5.000000 | 315.059595 | NA |
| 1da09dd64e235e7c2f29a4faff33535c | RJ | 1 | 29.37612 | 3 | 10 | R$2,164.40 | R$721.47 | 3.666667 | 230.155417 | NA |
| d97b3cfb22b0d6b25ac9ed4e9c2d481b | SP | 1 | 28.36879 | 2 | 13 | R$1,044.13 | R$522.07 | 1.000000 | 316.079248 | NA |
| a40096fc0a3862e9e12bc55b5f8e6ab2 | RJ | 1 | 27.26608 | 3 | 8 | R$457.18 | R$152.39 | 1.000000 | 322.007488 | NA |
| efce1ab3e96ccab8b1b464326bd22417 | SP | 1 | 26.97205 | 2 | 11 | R$904.72 | R$452.36 | 5.000000 | 155.021921 | NA |
| 6f90ec5150be36c4475835b5941ab56f | PR | 1 | 26.43947 | 1 | 6 | R$406.92 | R$406.92 | 1.000000 | 688.683472 | NA |
| 70ad91901ad4330af432978ff134fc6f | SP | 1 | 25.37495 | 2 | 11 | R$660.44 | R$330.22 | 5.000000 | 162.931366 | NA |
| b18c8cbd9616a95055ef4146f3f805e1 | SP | 1 | 23.68545 | 2 | 9 | R$242.19 | R$121.10 | 4.000000 | 85.980127 | NA |
| 8dc697d03f771cecc2534534a73eaaf9 | MT | 1 | 23.52040 | 1 | 6 | R$217.18 | R$217.18 | 1.000000 | 5.080926 | NA |
| 41a3b256cc497dc952a815b848345cbc | RS | 1 | 23.43077 | 1 | 7 | R$421.69 | R$421.69 | 1.000000 | 214.147280 | NA |
| 08a374bca4063116d5530a7b04ecaf3f | SC | 1 | 21.99691 | 2 | 7 | R$342.77 | R$171.38 | 5.000000 | 16.103252 | NA |
| 8bd9c53ae5e194fd363ed6714235e6be | MG | 1 | 21.47973 | 1 | 3 | R$274.84 | R$274.84 | 1.000000 | 240.078935 | NA |
| e079b18794454de9d2be5c12b4392294 | RJ | 1 | 21.33996 | 2 | 6 | R$437.78 | R$218.89 | 5.000000 | 440.838981 | NA |
| 6dcf48fcc0c4d96dd7af8a0897ae9de8 | SC | 1 | 21.06965 | 2 | 7 | R$447.28 | R$223.64 | 3.000000 | 43.024144 | NA |
| 41b4af321ed477242aebe74652fd8219 | SP | 1 | 20.56987 | 2 | 7 | R$254.23 | R$127.12 | 5.000000 | 78.749526 | NA |
| c8ed31310fc440a3f8031b177f9842c3 | SP | 1 | 20.45170 | 1 | 10 | R$1,157.28 | R$1,157.28 | 3.000000 | 17.534225 | NA |
| ac95cdab584cac0e86ce97a5399ecf79 | MG | 1 | 20.11288 | 2 | 6 | R$886.12 | R$443.06 | 3.000000 | 143.097789 | NA |
| 76b61cd27ca2933a2bd2669762fb229a | RS | 1 | 19.44616 | 1 | 4 | R$214.19 | R$214.19 | 1.000000 | 214.945174 | NA |
| 07547425c8f7a7086f529f2d3b80d21a | SP | 1 | 19.44164 | 1 | 4 | R$397.90 | R$397.90 | 1.000000 | 77.960116 | NA |
| d00cb93157046651ca3e21a3fc422413 | RS | 1 | 19.32545 | 1 | 4 | R$547.29 | R$547.29 | 1.000000 | 284.801724 | NA |
| 630331e73a79ae9a2ba821bd326ea298 | SC | 1 | 18.60385 | 2 | 5 | R$233.07 | R$116.54 | 5.000000 | 420.296100 | NA |
| 473d9165586cfc6708841d07adf6c020 | SP | 1 | 18.33637 | 2 | 5 | R$645.98 | R$322.99 | 4.500000 | 105.180637 | NA |
| bf869f6a89c8ba217f47e22359f884f2 | SP | 1 | 18.27843 | 1 | 7 | R$209.57 | R$209.57 | 1.000000 | 13.438009 | NA |
| ed4c785e11a78a73c48fa3af98008292 | MG | 1 | 18.25309 | 2 | 5 | R$409.70 | R$204.85 | 5.000000 | 102.017616 | NA |
outlier_profile <- cluster_profiles %>%
filter(cluster == outlier_cluster)
largest_regular_cluster <- cluster_profiles %>%
filter(cluster != outlier_cluster) %>%
slice_max(customers, n = 1, with_ties = FALSE)
tibble(
finding = c(
"Number of clusters",
"Outlier cluster",
"Outlier-cluster size",
"Outlier-cluster average Mahalanobis distance",
"Largest regular cluster",
"Largest regular-cluster size"
),
value = c(
k,
as.character(outlier_cluster),
paste0(comma(outlier_profile$customers), " customers"),
comma(outlier_profile$avg_mahalanobis_distance, accuracy = 0.01),
as.character(largest_regular_cluster$cluster),
paste0(comma(largest_regular_cluster$customers), " customers")
)
) %>%
kable()
| finding | value |
|---|---|
| Number of clusters | 4 |
| Outlier cluster | 1 |
| Outlier-cluster size | 1,255 customers |
| Outlier-cluster average Mahalanobis distance | 9.86 |
| Largest regular cluster | 2 |
| Largest regular-cluster size | 56,257 customers |
The outlier cluster is not defined as “bad” customers. It is the group whose purchasing behavior is most statistically unusual after accounting for relationships among the features. These customers may be unusually high spenders, unusually frequent buyers, customers with large order values, customers with unusual delivery or review patterns, or a combination of these traits.