Predictive Modelling & Segmentation Case Study

Author

Data Science Team

Published

April 24, 2026

Load necessary libraries

# Attach the packages used by the analysis. Each call is on its own line:
# R does not accept several calls on one line without a separator.
library(tidyverse)
library(cluster)
library(factoextra)
library(caret)
library(tidymodels)

# Fix the random seed so the simulated data and the k-means random starts
# are reproducible between runs.
set.seed(42)

Create synthetic data

# Simulate a recency / frequency / monetary table for `n` customers drawn
# from two populations:
#   * ~70%: recency ~ N(20, 10),  frequency ~ N(50, 15), monetary ~ N(500, 100)
#   * ~30%: recency ~ N(100, 20), frequency ~ N(5, 5),   monetary ~ N(50, 20)
n <- 300

# Round the split explicitly and derive the remainder, so the two groups
# always add up to `n` rows (a fractional `n * 0.7` would otherwise be
# truncated by rnorm() and could leave the columns shorter than customer_id).
n_high <- round(n * 0.7)
n_low <- n - n_high

customer_data <- tibble(
  customer_id = seq_len(n),
  recency = c(rnorm(n_high, 20, 10), rnorm(n_low, 100, 20)),
  frequency = c(rnorm(n_high, 50, 15), rnorm(n_low, 5, 5)),
  monetary = c(rnorm(n_high, 500, 100), rnorm(n_low, 50, 20))
) %>%
  mutate(across(everything(), ~ abs(.))) # Ensure positive values

Scale data for K-Means

# Build the clustering input: keep only the three behavioural columns and
# standardise them with scale() (default centring and scaling), so that no
# single variable dominates the Euclidean distances used by k-means.
# The result is a numeric matrix, not a tibble.
scaled_data <- customer_data %>%
  select(recency, frequency, monetary) %>%
  scale()

Determine optimal clusters

# Elbow plot: total within-cluster sum of squares ("wss") for a range of k,
# used to pick the number of clusters visually.
# String literals use straight quotes; curly quotes are a parse error in R.
fviz_nbclust(scaled_data, kmeans, method = "wss") +
  labs(title = "Elbow Method for Optimal K")

Fit K-Means with k=3

# Partition the standardised features into three clusters. `nstart = 25`
# runs k-means from 25 random initialisations and keeps the best solution.
km_res <- kmeans(scaled_data, centers = 3, nstart = 25)

# Attach each customer's cluster label to the original table as a factor so
# it can be used as a discrete colour/grouping variable.
customer_data$segment <- as.factor(km_res$cluster)

Visualize Clusters

# Plot the fitted clusters as points, with a convex hull drawn around each
# cluster.
# String literals use straight quotes; curly quotes are a parse error in R.
fviz_cluster(
  km_res,
  data = scaled_data,
  geom = "point",
  ellipse.type = "convex",
  main = "Customer Segments"
)

Simulate a Supervised ‘Propensity Score’

The supervised score assumes that higher frequency combined with lower recency indicates a high-value customer.

# Hand-built linear "propensity" score: frequency contributes positively
# (weight 0.6) and recency negatively (weight 0.4).
customer_data <- mutate(
  customer_data,
  propensity_score = 0.6 * frequency - 0.4 * recency
)

Plotting the difference

#| echo: false
#| warning: false
#| message: false
# Scatter of monetary value against the propensity score, coloured by the
# k-means segment. The chunk options above must each sit on their own line:
# on a shared line, everything after the first `#` (including the plot call)
# is a comment and never runs.
ggplot(
  customer_data,
  aes(x = propensity_score, y = monetary, color = segment)
) +
  geom_point(size = 3, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Unsupervised Segments within Supervised Propensity",
    subtitle = "Notice how clusters overlap on the score but are distinct in behavior",
    x = "Supervised Propensity Score",
    y = "Monetary Value"
  )