Customer personality analysis: Cluster analysis and customer ranking

Lim Jia Qi

5/24/2022

Load library packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(e1071)   # for skewness calculation
require(cluster)  # pam algorithm
## Loading required package: cluster
library(fpc)      # cluster evaluation metrics
library(mdw)      # entropy-based feature weights
library(MCDA)      # TOPSIS
library(FactoMineR)  # Factor analysis for mixed data (FAMD)
library(factoextra)  
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(knitr)       # For beautiful table display
library(car)         # For interactive 3D scatter plot
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Load the previously saved data

# set the working directory
setwd("~/ml projects/customer segmentation")
cust_data = readRDS("preprocessed_cust_data.rds")

Cluster analysis

Dissimilarity matrix computation using the Gower distance measure

For mixed data types, the daisy() function from the cluster package is used to compute the pairwise dissimilarities between observations. The type argument of daisy() specifies the variable types in the input cust_data_without_ID. More information on daisy() can be obtained through the help() function.
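As a minimal sketch of what daisy() does with mixed types (a toy data frame, not the project data), the Gower coefficient averages per-variable dissimilarities that are each scaled to [0, 1]:

# Toy illustration of Gower dissimilarity over mixed variable types
# (cluster is already loaded above)
toy = data.frame(
  income  = c(30000, 55000, 80000),         # continuous (interval-scaled)
  married = factor(c("yes", "no", "yes")),  # nominal
  promo   = c(TRUE, FALSE, FALSE)           # asymmetric binary
)
daisy(toy, metric = "gower", type = list(asymm = "promo"))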

# Cluster analysis with k-medoids (PAM) algorithm
idx_col_retain = !(colnames(cust_data) %in% c("ID", "min_num_household", "tot_AcceptedCmp"))
cust_data_without_ID = cust_data[,idx_col_retain]

# Boolean attributes
seq_binary = c("Complain", "Response", "Accepted")
idx_binary = sapply(seq_binary, 
                    function(x) grep(x, colnames(cust_data_without_ID)))
idx_binary = unlist(idx_binary)

# Continuous attributes
cont_features_patterns = c("Mnt", "Num", "Income","Recency", "age", 
                           "days_enroll") 
idx_cont = sapply(cont_features_patterns, 
                  function(x) grep(x, colnames(cust_data_without_ID)))
idx_cont = unlist(idx_cont)

skewness_col = apply(cust_data_without_ID[, idx_cont], 2, skewness)
# Flag heavily skewed continuous attributes (|skewness| > 1) for log treatment via logratio
idx_logtrans = idx_cont[which(abs(skewness_col)>1)]

# Ordinal attributes: Kidhome and Teenhome (matched by "home"); the indices must
# refer to cust_data_without_ID, the data actually passed to daisy()
dissimilarity_matrix = daisy(cust_data_without_ID, metric = "gower", 
                             type = list(ordratio = grep("home", colnames(cust_data_without_ID)),
                                         asymm = idx_binary,
                                         logratio = idx_logtrans))

K-medoids clustering (PAM algorithm)

Selection of the number of clusters k using cluster evaluation measures

Cluster evaluation measures can be divided into two categories:

  1. Internal: Typical objective functions in clustering formalize the goal of attaining high intra-cluster similarity (data within a cluster are similar) and low inter-cluster similarity (data from different clusters are dissimilar). Similarity, a numerical measure of how alike two data instances are, is normally quantified by a distance measure (e.g., Euclidean distance).
  2. External: A ground truth or gold standard must be available to evaluate how well the clustering results match the ground-truth labels.

I will be using the internal cluster evaluation metrics available in the fpc package, namely the elbow method (within-cluster sum of squares), average silhouette width, the Calinski-Harabasz index, and the Dunn index, to find the optimal number of clusters. A quick sketch of the silhouette criterion follows.
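For a single observation i, the silhouette width is s(i) = (b(i) − a(i)) / max(a(i), b(i)), where a(i) is the mean dissimilarity of i to its own cluster and b(i) the mean dissimilarity to the nearest other cluster. As a minimal sketch (assuming the dissimilarity_matrix computed above, with k = 3 chosen arbitrarily), the average silhouette width for one candidate k can be inspected directly:

# Sketch: average silhouette width for a single candidate k
pam_k3 = pam(dissimilarity_matrix, k = 3, diss = TRUE)
pam_k3$silinfo$avg.width   # the quantity tracked across k in the loop below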

# The number of clusters is suspected to lie in the range 2 to 8.
k_array = seq(from = 2, to = 8, by=1)
cluster_eval_df = data.frame(matrix(NA, nrow = length(k_array), ncol = 4))
colnames(cluster_eval_df) = c("silhouette", "CH_index", "Dunn_index", "wc_SOS")
cluster_eval_df$k = k_array

for (i in (1:length(k_array))){
  set.seed(i+100)
  kmedoid = pam(dissimilarity_matrix, k = k_array[i], diss = TRUE,
                nstart = 10)
  # diss = TRUE since the input is a dissimilarity matrix; use 10 random starts.
  clust_stat = cluster.stats(dissimilarity_matrix, 
                             clustering = kmedoid$clustering)
  cluster_eval_df[i,"silhouette"] = clust_stat$avg.silwidth
  cluster_eval_df[i, "CH_index"] = clust_stat$ch
  cluster_eval_df[i, "Dunn_index"] = clust_stat$dunn
  # Add in the within cluster sum of squares
  cluster_eval_df[i, "wc_SOS"] = clust_stat$within.cluster.ss
}

# Line plot for all evaluation metrics (internal cluster evaluation)
cluster_eval_df %>% 
  pivot_longer(!k, names_to = "cluster_eval", values_to = "value") %>% 
  ggplot(aes(x = k, y = value)) +
  geom_line() + geom_point() + facet_grid(rows = vars(cluster_eval), 
                                          scales = "free_y")

# Helper to find the statistical mode, adapted from a Stack Overflow answer:
# https://stackoverflow.com/questions/2547402/how-to-find-the-statistical-mode?page=1&tab=scoredesc#tab-top
mode = function(x){
  ux = unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
# Most frequent "best" index across the three metrics; + 1 converts an index
# into k_array (which starts at k = 2) to the actual number of clusters.
best_k = mode(c(which.max(cluster_eval_df$silhouette), 
                which.max(cluster_eval_df$CH_index),
                which.max(cluster_eval_df$Dunn_index))) + 1

Final clustering model

# Reuse the seed from the evaluation loop for this k (set.seed(i + 100) with i = best_k - 1)
set.seed(100 + best_k - 1)
kmedoid = pam(dissimilarity_matrix, k = best_k, diss = TRUE, nstart = 10)
# Save the cluster label in the dataframe. Change it to factor to facilitate data wrangling.
cust_data$cluster_label = as.factor(kmedoid$clustering)
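Because PAM medoids are actual observations, inspecting them gives a quick representative profile of each cluster. A small optional sketch, using the fitted kmedoid object from above:

# Sketch: id.med holds the row indices of the medoid customers
cust_data_without_ID[kmedoid$id.med, ]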

Cluster profiling

Empirical analysis (Statistical summaries) of each cluster

#1: Number of observations (customers)
cust_data %>% group_by(cluster_label) %>% 
  summarise(n = n()) %>% kable(caption = "Number of observations in each cluster")
Number of observations in each cluster

| cluster_label |   n |
|---------------|-----|
| 1             | 739 |
| 2             | 833 |
| 3             | 662 |
#2: Average of numerical features for each cluster
summary_cont_features_per_cluster = 
  cust_data %>% group_by(cluster_label) %>% 
  summarise_if(is.numeric, mean) %>% select(-ID)
kable(summary_cont_features_per_cluster, "html", caption = "mean attributes for each cluster") %>% kable_styling("striped") %>% scroll_box(width = "100%")
mean attributes for each cluster

| cluster_label | Income | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | min_num_household | tot_AcceptedCmp | age | days_enroll |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 67575.36 | 49.68200 | 484.30311 | 50.635995 | 321.79838 | 71.07713 | 51.048715 | 72.24357 | 1.807848 | 5.185386 | 4.5575101 | 7.801082 | 3.746955 | 2.069012 | 0.6968877 | 46.78214 | 362.2679 |
| 2 | 33797.88 | 47.93758 | 73.59784 | 7.223289 | 39.27131 | 10.43938 | 8.028811 | 21.37815 | 2.500600 | 2.566627 | 0.7394958 | 3.480192 | 6.830732 | 2.926771 | 0.2292917 | 40.00840 | 350.3902 |
| 3 | 55723.21 | 50.10725 | 393.01813 | 23.126888 | 155.21601 | 34.29154 | 24.370091 | 40.88520 | 2.676737 | 4.767372 | 2.9743202 | 6.469788 | 5.163142 | 2.768882 | 0.4425982 | 49.64804 | 354.4955 |


#3: Distribution of categorical features for each cluster
cate_features_names = names(Filter(is.factor, cust_data))
# Change the grouping variable in group_by() and the fill aesthetic in aes() to the
# desired categorical feature: Education, Marital_status, Kidhome, Teenhome.
cust_data %>% select(one_of(cate_features_names)) %>% 
  group_by(cluster_label, Teenhome) %>% summarise(n = n()) %>% 
  mutate(prop = (n/sum(n))) %>% 
  ggplot(aes(x = cluster_label, y = prop, fill = Teenhome)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
            position = position_dodge(0.9)) +
  theme_minimal()
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.

#4: Distribution of minimum number of household members for each cluster
cust_data$min_num_household = factor(cust_data$min_num_household, ordered = TRUE)
cust_data %>% group_by(cluster_label, min_num_household) %>% summarise(n = n()) %>% 
  mutate(prop = (n/sum(n))) %>% 
  ggplot(aes(x = cluster_label, y = prop, fill = min_num_household)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
            position = position_dodge(0.9)) +
  theme_minimal()
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.

#5: Tabulate binary (Boolean) variables for each cluster
table1 = 
  cust_data %>% 
  group_by(cluster_label, Complain) %>% summarise(n = n()) %>% 
  mutate(prop = (n/sum(n))) %>% select(-n)
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.
colnames(table1)[2] = "Binary_outcomes"

cust_data_1 = cust_data %>% 
  select(contains("Response") | contains("Accept") | contains("cluster"))

n_col = ncol(cust_data_1)
for (i in (1:(n_col-2))){
  table2 = cust_data_1 %>% select(all_of(c(n_col, i))) %>% 
    group_by_all() %>% 
    summarise(n = n()) %>% 
    mutate(prop = (n/sum(n))) %>% 
    select(-n)
  colnames(table2)[2] = "Binary_outcomes"
  table1 = table1 %>% 
    left_join(table2, by = c("cluster_label" = "cluster_label", 
                             "Binary_outcomes" = "Binary_outcomes"))
}
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
oldname = colnames(table1)[3:length(colnames(table1))]
newname = c("Complain", 
            colnames(cust_data_1)[-length(colnames(cust_data_1))])
# Rename
for (i in (1:length(newname))){
  names(table1)[names(table1) == oldname[i]] = newname[i]
}
table1 %>% kable("html", caption = "Proportions of Boolean attributes of each cluster") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Proportions of Boolean attributes of each cluster

| cluster_label | Binary_outcomes | Complain | Response | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 |
|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.9932341 | 0.8159675 | 0.9269283 | 0.8849797 | 0.8349120 | 0.8619756 | 0.9783491 |
| 1 | 1 | 0.0067659 | 0.1840325 | 0.0730717 | 0.1150203 | 0.1650880 | 0.1380244 | 0.0216509 |
| 2 | 0 | 0.9867947 | 0.8835534 | 0.9207683 | 0.9819928 | 0.9951981 | 0.9891957 | 1.0000000 |
| 2 | 1 | 0.0132053 | 0.1164466 | 0.0792317 | 0.0180072 | 0.0048019 | 0.0108043 | NA |
| 3 | 0 | 0.9939577 | 0.8489426 | 0.9350453 | 0.8987915 | 0.9456193 | 0.9501511 | 0.9788520 |
| 3 | 1 | 0.0060423 | 0.1510574 | 0.0649547 | 0.1012085 | 0.0543807 | 0.0498489 | 0.0211480 |

Principal component analysis for mixed data (FAMD)

Visualize the data in a lower-dimensional space.

res.famd = FAMD(cust_data_without_ID, graph = F)

eig.value = get_eigenvalue(res.famd)
head(eig.value)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1   6.743189        18.731080                    18.73108
## Dim.2   2.205379         6.126053                    24.85713
## Dim.3   1.939483         5.387452                    30.24458
## Dim.4   1.688255         4.689596                    34.93418
## Dim.5   1.231814         3.421705                    38.35589
fviz_screeplot(res.famd)

# predict to get the transformed feature space and plot on 3 dimensional scatter plot
transformed_data = predict(res.famd, cust_data_without_ID)
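As an optional check, factoextra's fviz_contrib (which supports FAMD objects) can show which variables drive the leading dimensions; a short sketch:

# Sketch: top variable contributions to FAMD dimensions 1 and 2
fviz_contrib(res.famd, choice = "var", axes = 1)
fviz_contrib(res.famd, choice = "var", axes = 2)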

The code snippet below is adapted from a Stack Overflow post.

options(rgl.useNULL = TRUE)
library(rgl)
## Warning: package 'rgl' was built under R version 4.1.3
scatter3d(x = transformed_data$coord[,1], y = transformed_data$coord[,2],
          z = transformed_data$coord[,3], 
          groups = cust_data$cluster_label, grid = FALSE, 
          surface = FALSE, ellipsoid = TRUE, 
          surface.col = c("#80FF00", "#009999", "#FF007F"))
## Loading required namespace: mgcv
rglwidget()

Note: a pop-up window will appear in RStudio if you run the code chunk above, and you can freely rotate the interactive 3D scatter plot.

Customer ranking

TOPSIS

TOPSIS (Technique for Order of Preference by Similarity to Ideal Solution) is used to rank the alternatives (customers) within each cluster based on the spending (Mnt), purchase (Num), campaign acceptance (Accepted), and Response features. A minimal sketch of the score it computes follows.
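As an illustration only (the analysis itself uses MCDA::TOPSIS below), TOPSIS normalizes each criterion, applies the feature weights, and scores each alternative by its relative distance to the ideal and anti-ideal solutions. A minimal sketch with benefit ("max") criteria and hypothetical data:

# Toy TOPSIS sketch; all names here are hypothetical
topsis_sketch = function(X, w) {
  R = sweep(X, 2, sqrt(colSums(X^2)), "/")      # vector-normalize each criterion
  V = sweep(R, 2, w, "*")                       # weighted normalized matrix
  ideal = apply(V, 2, max)                      # positive ideal solution
  anti  = apply(V, 2, min)                      # negative (anti-)ideal solution
  d_pos = sqrt(rowSums(sweep(V, 2, ideal)^2))   # distance to ideal
  d_neg = sqrt(rowSums(sweep(V, 2, anti)^2))    # distance to anti-ideal
  d_neg / (d_pos + d_neg)                       # closeness score in [0, 1]; higher is better
}
topsis_sketch(matrix(c(5, 3, 8, 2, 9, 4), nrow = 3), w = c(0.6, 0.4))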

Entropy-based feature weighting

# Calculate feature weights for the continuous and categorical features of the first cluster
i = 1
data_analysis_cont = cust_data %>% filter(cluster_label==i) %>% dplyr::select(starts_with(c("Mnt", "Num")))
data_analysis_cate = cust_data %>% filter(cluster_label==i) %>% 
  dplyr::select(starts_with(c("Accepted", "Response")))

h_cont = get.bw(scale(data_analysis_cont), bw = "nrd", nb = 'na')
w_cont = entropy.weight(scale(data_analysis_cont), h = h_cont)
w_cate = entropy.weight(data_analysis_cate, h='na')
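As a rough sketch of the idea behind these weights (mdw::entropy.weight generalizes it; for continuous features it uses kernel density estimates with bandwidth h rather than simple category proportions), a feature whose values are spread almost uniformly has high entropy and little discriminating power, so it receives a low weight:

# Hypothetical helper illustrating entropy-based weights for categorical features
entropy_weight_sketch = function(df) {
  e = sapply(df, function(x) {
    p = prop.table(table(x))
    p = p[p > 0]                        # drop empty levels to avoid log(0)
    -sum(p * log(p)) / log(nrow(df))    # entropy, normalized to [0, 1]
  })
  (1 - e) / sum(1 - e)                  # lower entropy -> larger weight
}
# e.g. entropy_weight_sketch(data_analysis_cate)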

Customer ranking for the first cluster

# Convert the factor 0/1 campaign flags to numeric so TOPSIS can use them as criteria
data_analysis_cate <- lapply(data_analysis_cate, 
                             function(x) as.numeric(as.character(x)))
data_analysis = cbind(data_analysis_cont, data_analysis_cate)

feat_imp = c(w_cont, w_cate)
overall = TOPSIS(data_analysis, 
                 feat_imp, 
                 criteriaMinMax = rep("max", length(feat_imp)))
data_analysis$topsis_score = overall
# pull() returns a plain vector; select(ID) would nest a one-column data frame
data_analysis$ID = cust_data %>% filter(cluster_label==i) %>% pull(ID)

# Top 10 customers for cluster 1
data_analysis %>% arrange(desc(topsis_score)) %>% as_tibble() %>% head(10) %>%  relocate(ID, topsis_score) %>% kable("html", caption = "customer ranking according to TOPSIS scores") %>% kable_styling("striped") %>% scroll_box(width = "100%")
customer ranking according to TOPSIS scores

| ID | topsis_score | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1763 | 0.5403485 | 1259 | 172 | 815 | 97 | 148 | 33 | 1 | 7 | 11 | 10 | 4 | 1 | 0 | 1 | 1 | 0 | 1 |
| 4580 | 0.5394507 | 1394 | 22 | 708 | 89 | 91 | 182 | 1 | 9 | 7 | 9 | 5 | 1 | 0 | 1 | 1 | 0 | 1 |
| 9010 | 0.5309458 | 968 | 147 | 842 | 137 | 42 | 210 | 1 | 5 | 7 | 10 | 2 | 1 | 0 | 1 | 1 | 0 | 1 |
| 477 | 0.5216004 | 1060 | 61 | 835 | 80 | 20 | 101 | 1 | 4 | 7 | 10 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
| 9058 | 0.5212907 | 1060 | 21 | 530 | 32 | 0 | 224 | 1 | 5 | 11 | 5 | 3 | 1 | 0 | 1 | 1 | 0 | 1 |
| 5758 | 0.5203677 | 1074 | 0 | 69 | 0 | 0 | 46 | 1 | 10 | 4 | 13 | 6 | 1 | 0 | 1 | 1 | 1 | 1 |
| 7832 | 0.5181207 | 940 | 44 | 396 | 0 | 88 | 58 | 1 | 8 | 7 | 7 | 4 | 1 | 0 | 1 | 1 | 0 | 1 |
| 7872 | 0.5130449 | 179 | 21 | 273 | 0 | 21 | 63 | 1 | 6 | 10 | 6 | 5 | 1 | 0 | 1 | 1 | 0 | 1 |
| 3520 | 0.5102894 | 162 | 28 | 818 | 0 | 28 | 56 | 0 | 4 | 3 | 7 | 3 | 1 | 0 | 1 | 1 | 1 | 1 |
| 5538 | 0.5093125 | 897 | 161 | 430 | 186 | 161 | 27 | 0 | 4 | 7 | 6 | 1 | 1 | 0 | 1 | 1 | 0 | 1 |
For reproducibility, the R build used for this analysis:

##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          4                           
## minor          1.2                         
## year           2021                        
## month          11                          
## day            01                          
## svn rev        81115                       
## language       R                           
## version.string R version 4.1.2 (2021-11-01)
## nickname       Bird Hippie