# Predictive Modelling & Segmentation Case Study
#
# Author: Data Science Team
#
# Published: April 24, 2026

# Packages used by this script. Each library() call needs its own line (or a
# `;` separator); several calls run together on one line do not parse.
library(tidyverse)
library(cluster)
library(factoextra)
library(tidymodels)

# Fix the RNG seed so the random draws and the random k-means starts below
# are reproducible from run to run.
set.seed(42)

# Create synthetic data
#
# Two simulated customer groups are stacked row-wise: about 70% of rows are
# drawn with low recency / high frequency / high monetary means, and the
# remaining 30% with high recency / low frequency / low monetary means.
n <- 300

# Derive the two group sizes as whole numbers that always sum to n. Passing a
# fractional count such as n * 0.7 straight to rnorm() may yield fewer draws
# than intended, leaving a column shorter than customer_id.
n_group_a <- round(n * 0.7)
n_group_b <- n - n_group_a

customer_data <- tibble(
  customer_id = seq_len(n),
  recency = c(rnorm(n_group_a, 20, 10), rnorm(n_group_b, 100, 20)),
  frequency = c(rnorm(n_group_a, 50, 15), rnorm(n_group_b, 5, 5)),
  monetary = c(rnorm(n_group_a, 500, 100), rnorm(n_group_b, 50, 20))
) %>%
  # Normal draws can be negative; replace every column by its absolute value
  # so all simulated quantities are non-negative.
  mutate(across(everything(), abs))

# Scale data
#
# Keep only the three behavioural columns and standardise them with scale()
# (default centring and scaling), so that the large-valued monetary column
# does not dominate the distance-based clustering below. scale() returns a
# numeric matrix rather than a tibble.
scaled_data <- customer_data %>%
  select(recency, frequency, monetary) %>%
  scale()

# -----------------------------
# Elbow Method Plot
# -----------------------------
# Plot the "wss" criterion from fviz_nbclust() for k-means on the scaled
# data; the bend ("elbow") in the curve suggests a reasonable number of
# clusters. print() is explicit so the plot is drawn even when the script is
# source()d.
print(
  fviz_nbclust(scaled_data, kmeans, method = "wss") +
    labs(title = "Elbow Method for Optimal K")
)

# -----------------------------
# K-Means Clustering
# -----------------------------
# Fit k-means with 3 centres on the scaled matrix. nstart = 25 runs 25 random
# initialisations and keeps the best one, which makes the result less
# sensitive to an unlucky starting configuration.
km_res <- kmeans(scaled_data, centers = 3, nstart = 25)

# Attach each row's cluster assignment to the customer table as a factor so
# it is treated as a discrete colour scale when plotted.
customer_data$segment <- as.factor(km_res$cluster)

# -----------------------------
# Cluster Visualization
# -----------------------------
# Draw the fitted clusters with fviz_cluster(): observations as points, each
# cluster outlined by its convex hull.
print(
  fviz_cluster(
    km_res,
    data = scaled_data,
    geom = "point",
    ellipse.type = "convex",
    main = "Customer Segments"
  )
)

# -----------------------------
# Propensity Score
# -----------------------------
# Hand-weighted linear score on the unscaled columns: higher frequency raises
# it (weight 0.6) and higher recency lowers it (weight 0.4).
# NOTE(review): the weights are fixed constants, not fitted by a model, even
# though the final plot labels this score "Supervised" -- confirm the wording.
customer_data <- customer_data %>%
  mutate(propensity_score = (frequency * 0.6) - (recency * 0.4))

# -----------------------------
# Final Combined Plot
# -----------------------------
# Scatter of monetary value against the propensity score, coloured by k-means
# segment, to show how the clusters are distributed along the score axis.
print(
  ggplot(
    customer_data,
    aes(x = propensity_score, y = monetary, color = segment)
  ) +
    geom_point(size = 3, alpha = 0.7) +
    theme_minimal() +
    labs(
      title = "Unsupervised Segments within Supervised Propensity",
      subtitle = "Clusters overlap on score but remain behaviorally distinct",
      x = "Supervised Propensity Score",
      y = "Monetary Value"
    )
)