Predictive Modelling & Segmentation Case Study

Author

Data Science Team

Published

April 24, 2026

Keep only libraries and data prep here. NO PLOTTING CODE.

# Package setup: one library() call per line so the script parses.
library(tidyverse)
library(cluster)
library(factoextra)
library(tidymodels)

# Fix the RNG seed so the simulated data and the k-means random starts
# are reproducible across runs.
set.seed(42)

Create synthetic data

# Total number of simulated customers.
n <- 300

# Split n into a 70% / 30% mix. The sizes are computed as integers that
# always sum to n, so the concatenated columns below match the length of
# customer_id even when n * 0.7 is not a whole number.
n_major <- round(n * 0.7)
n_minor <- n - n_major

# Simulate recency / frequency / monetary values from two normal mixtures:
#   70% group: recency ~ N(20, 10),  frequency ~ N(50, 15), monetary ~ N(500, 100)
#   30% group: recency ~ N(100, 20), frequency ~ N(5, 5),   monetary ~ N(50, 20)
customer_data <- tibble(
  customer_id = seq_len(n),
  recency = c(rnorm(n_major, 20, 10), rnorm(n_minor, 100, 20)),
  frequency = c(rnorm(n_major, 50, 15), rnorm(n_minor, 5, 5)),
  monetary = c(rnorm(n_major, 500, 100), rnorm(n_minor, 50, 20))
) %>%
  # Normal draws can be negative; fold them back to non-negative values.
  # customer_id is already positive, so only the simulated columns need it.
  mutate(across(c(recency, frequency, monetary), abs))

Scale data

# Centre and scale the three behavioural columns; scale() returns a
# numeric matrix, which is what kmeans() and the factoextra helpers take.
scaled_data <- customer_data %>%
  select(recency, frequency, monetary) %>%
  scale()

# This chunk WILL show because it is not labeled 'setup'
# Elbow plot: total within-cluster sum of squares ("wss") against k.
fviz_nbclust(scaled_data, kmeans, method = "wss") +
  labs(title = "Elbow Method for Optimal K")

# Perform clustering
# nstart = 25 runs k-means from 25 random starts and keeps the best fit.
km_res <- kmeans(scaled_data, centers = 3, nstart = 25)

# Attach each customer's cluster label to the original table as a factor
# so it can be used as a discrete colour aesthetic later.
customer_data$segment <- as.factor(km_res$cluster)

Visualize Clusters

# Draw the fitted k-means clusters as points with a convex hull around
# each cluster. scaled_data has three columns, so factoextra projects the
# observations onto principal components for the 2-D plot.
fviz_cluster(
  km_res,
  data = scaled_data,
  geom = "point",
  ellipse.type = "convex",
  main = "Customer Segments"
)
# NOTE(review): Quarto only honours `#|` options at the top of a chunk;
# these two look like they belong to the start of the next chunk - confirm.
#| warning: false
#| message: false

Calculate Score

# Add a hand-weighted propensity score to every customer: frequency
# contributes positively (weight 0.6) and recency negatively (weight 0.4).
customer_data <- mutate(
  customer_data,
  propensity_score = 0.6 * frequency - 0.4 * recency
)

Plot

# Scatter plot of monetary value against the propensity score, coloured
# by k-means segment, to compare the clustering with the score.
ggplot(
  customer_data,
  aes(x = propensity_score, y = monetary, color = segment)
) +
  # Semi-transparent points keep overlapping customers visible.
  geom_point(size = 3, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Unsupervised Segments within Supervised Propensity",
    subtitle = "Notice how clusters overlap on the score but are distinct in behavior",
    x = "Supervised Propensity Score",
    y = "Monetary Value"
  )