Load library packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(e1071) # for skewness calculation
require(cluster) # PAM algorithm
## Loading required package: cluster
library(fpc) # cluster evaluation metrics
library(mdw) # entropy-based feature weights
library(MCDA) # TOPSIS
library(FactoMineR) # Factor analysis for mixed data (FAMD)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(knitr) # For beautiful table display
library(car) # For interactive 3D scatter plot
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Load the previously saved data
# set the working directory
setwd("~/ml projects/customer segmentation")
cust_data = readRDS("preprocessed_cust_data.rds")
Cluster analysis
Dissimilarity matrix computation using the Gower distance measure
For mixed data types, the daisy() function from the cluster package is used to calculate the pairwise dissimilarity between observations. The type argument of daisy() specifies the types of the variables in the input cust_data_without_ID. More information on daisy() can be obtained via the help() function.
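As a quick illustration, here is a minimal sketch of a daisy() call on toy data (hypothetical values, not from cust_data) mixing a continuous, a nominal and an asymmetric binary column:
toy = data.frame(income = c(30000, 58000, 72000),     # continuous
                 Teenhome = factor(c("0", "1", "1")), # nominal
                 Response = c(1, 0, 1))               # asymmetric binary (0/1)
# Gower rescales each variable's contribution to [0, 1] and averages them;
# asymm = 3 tells daisy() that joint absence (0/0) in column 3 carries no information.
daisy(toy, metric = "gower", type = list(asymm = 3))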
# Cluster analysis with k-medoids (PAM) algorithm
idx_col_retain = !(colnames(cust_data) %in% c("ID", "min_num_household", "tot_AcceptedCmp"))
cust_data_without_ID = cust_data[,idx_col_retain]
# Boolean attributes
seq_binary = c("Complain", "Response", "Accepted")
idx_binary = sapply(seq_binary,
function(x) grep(x, colnames(cust_data_without_ID)))
idx_binary = unlist(idx_binary)
# continuous attributes
cont_features_patterns = c("Mnt", "Num", "Income","Recency", "age",
"days_enroll")
idx_cont = sapply(cont_features_patterns,
function(x) grep(x, colnames(cust_data_without_ID)))
idx_cont = unlist(idx_cont)
skewness_col = apply(cust_data_without_ID[, idx_cont], 2, skewness)
idx_logtrans = idx_cont[which(abs(skewness_col)>1)]
# Ordinal attributes
dissimilarity_matrix = daisy(cust_data_without_ID, metric = "gower",
type = list(ordratio = grep("home", colnames(cust_data_without_ID)), # indices must refer to the data passed to daisy()
asymm = idx_binary,
logratio = idx_logtrans))
K-medoid clustering (PAM algorithm)
Selection of the number of clusters, k, using cluster evaluation measures
Cluster evaluation can be divided into two categories:
- Internal: Typical objective functions in clustering formalize the goal of attaining high intra-cluster similarity (data within a cluster are similar) and low inter-cluster similarity (data from different clusters are dissimilar). Similarity, a numerical measure of how alike two data instances are, is normally quantified by a distance measure (e.g. Euclidean distance).
- External: A ground truth or gold standard must be available, and evaluation measures how well the clustering results match the ground-truth labels.
I will be using the internal cluster evaluation metrics found in the fpc package, namely the elbow method (within-cluster sum of squares), average silhouette width, Calinski-Harabasz index and Dunn index, to find the optimal number of clusters.
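For intuition, the average silhouette width for a single candidate k can be computed directly with cluster::silhouette. A minimal sketch (pam_trial is a hypothetical name; assumes dissimilarity_matrix from the previous chunk):
pam_trial = pam(dissimilarity_matrix, k = 3, diss = TRUE)
sil = silhouette(pam_trial$clustering, dissimilarity_matrix)
mean(sil[, "sil_width"]) # the same quantity as cluster.stats(...)$avg.silwidth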
# The number of clusters is suspected to be in the range of 2 to 8.
k_array = seq(from = 2, to = 8, by=1)
cluster_eval_df = data.frame(matrix(NA, nrow = length(k_array), ncol = 4))
colnames(cluster_eval_df) = c("silhouette", "CH_index", "Dunn_index", "wc_SOS")
cluster_eval_df$k = k_array
for (i in (1:length(k_array))){
set.seed(i+100)
kmedoid = pam(dissimilarity_matrix, k = k_array[i], diss = TRUE,
nstart = 10)
# diss = TRUE treats the input as a dissimilarity matrix; use 10 random starts.
clust_stat = cluster.stats(dissimilarity_matrix,
clustering = kmedoid$clustering)
cluster_eval_df[i,"silhouette"] = clust_stat$avg.silwidth
cluster_eval_df[i, "CH_index"] = clust_stat$ch
cluster_eval_df[i, "Dunn_index"] = clust_stat$dunn
# Add in the within cluster sum of squares
cluster_eval_df[i, "wc_SOS"] = clust_stat$within.cluster.ss
}
# Line plot for all evaluation metrics (internal cluster evaluation)
cluster_eval_df %>%
pivot_longer(!k, names_to = "cluster_eval", values_to = "value") %>%
ggplot(aes(x = k, y = value)) +
geom_line() + geom_point() + facet_grid(rows = vars(cluster_eval),
scales = "free_y")# find the best_k from stackoverflow using the mode function found on
# https://stackoverflow.com/questions/2547402/how-to-find-the-statistical-mode?page=1&tab=scoredesc#tab-top
mode = function(x){
ux = unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
best_k = mode(c(which.max(cluster_eval_df$silhouette),
                which.max(cluster_eval_df$CH_index),
                which.max(cluster_eval_df$Dunn_index))) + 1 # k = index + 1 since k_array starts at 2
Final clustering model
set.seed(100 + best_k - 1) # same seed as the loop iteration that produced k = best_k
kmedoid = pam(dissimilarity_matrix, k = best_k, diss = TRUE, nstart = 10)
# Save the cluster label in the dataframe. Change it to factor to facilitate data wrangling.
cust_data$cluster_label = as.factor(kmedoid$clustering)
Cluster analysis
Empirical analysis (Statistical summaries) of each cluster
#1: Number of observations (customers)
cust_data %>% group_by(cluster_label) %>%
summarise(n = n()) %>% kable(caption = "Number of observations in each cluster")
| cluster_label | n |
|---|---|
| 1 | 739 |
| 2 | 833 |
| 3 | 662 |
#2: Average of numerical features for each cluster
summary_cont_features_per_cluster =
cust_data %>% group_by(cluster_label) %>%
summarise_if(is.numeric, mean) %>% select(-ID)
kable(summary_cont_features_per_cluster, "html", caption = "mean attributes for each cluster") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| cluster_label | Income | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | min_num_household | tot_AcceptedCmp | age | days_enroll |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 67575.36 | 49.68200 | 484.30311 | 50.635995 | 321.79838 | 71.07713 | 51.048715 | 72.24357 | 1.807848 | 5.185386 | 4.5575101 | 7.801082 | 3.746955 | 2.069012 | 0.6968877 | 46.78214 | 362.2679 |
| 2 | 33797.88 | 47.93758 | 73.59784 | 7.223289 | 39.27131 | 10.43938 | 8.028811 | 21.37815 | 2.500600 | 2.566627 | 0.7394958 | 3.480192 | 6.830732 | 2.926771 | 0.2292917 | 40.00840 | 350.3902 |
| 3 | 55723.21 | 50.10725 | 393.01813 | 23.126888 | 155.21601 | 34.29154 | 24.370091 | 40.88520 | 2.676737 | 4.767372 | 2.9743202 | 6.469788 | 5.163142 | 2.768882 | 0.4425982 | 49.64804 | 354.4955 |
# A list() of functions can be passed to summarise_if() to compute several statistics at once.
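For example, a sketch computing both the mean and standard deviation per cluster (suffixed column names such as Income_mean and Income_sd are produced automatically):
cust_data %>% group_by(cluster_label) %>%
  summarise_if(is.numeric, list(mean = mean, sd = sd)) %>%
  select(-starts_with("ID"))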
#3: Distribution of categorical features for each cluster
cate_features_names = names(Filter(is.factor, cust_data))
# Change the second argument of group_by() and the fill aesthetic in aes() to the desired categorical feature name: Education, Marital_status, Kidhome or Teenhome.
cust_data %>% select(one_of(cate_features_names)) %>%
group_by(cluster_label, Teenhome) %>% summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>%
ggplot(aes(x = cluster_label, y = prop, fill = Teenhome)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
position = position_dodge(0.9)) +
theme_minimal()
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.
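To avoid editing the chunk by hand for each feature, the plot can be wrapped in a small helper. A hedged sketch (plot_cate_dist is a hypothetical name, not part of the original analysis):
plot_cate_dist = function(df, feature){
  df %>% count(cluster_label, .data[[feature]]) %>%
    group_by(cluster_label) %>% mutate(prop = n / sum(n)) %>%
    ggplot(aes(x = cluster_label, y = prop, fill = .data[[feature]])) +
    geom_bar(stat = "identity", position = position_dodge()) +
    geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
              position = position_dodge(0.9)) +
    theme_minimal()
}
plot_cate_dist(cust_data, "Education") # repeat for Marital_status, Kidhome, Teenhome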
#4: Distribution of minimum number of household members for each cluster
cust_data$min_num_household = factor(cust_data$min_num_household, ordered = TRUE)
cust_data %>% group_by(cluster_label, min_num_household) %>% summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>%
ggplot(aes(x = cluster_label, y = prop, fill = min_num_household)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
position = position_dodge(0.9)) +
theme_minimal()
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.
#5: Tabulate binary (Boolean) variables for each cluster
table1 =
cust_data %>%
group_by(cluster_label, Complain) %>% summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>% select(-n)
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.
colnames(table1)[2] = "Binary_outcomes"
#pattern = c("Complain", "Response", "Accept", "cluster")
#idx_select = lapply(pattern, function (x) grep(x, colnames(cust_data)))
#idx = unlist(idx_select)
cust_data_1 = cust_data %>%
select(contains("Response") | contains("Accept") | contains("cluster"))
n_col = ncol(cust_data_1)
for (i in (1:(n_col-2))){
table2 = cust_data_1 %>% select(c(all_of(n_col), all_of(i))) %>%
group_by_all() %>%
summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>%
select(-n)
colnames(table2)[2] = "Binary_outcomes"
table1 = table1 %>%
left_join(table2, by = c("cluster_label" = "cluster_label",
"Binary_outcomes" = "Binary_outcomes"))
}
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
oldname = colnames(table1)[3:length(colnames(table1))]
newname = c("Complain",
            colnames(cust_data_1)[1:(n_col-2)]) # only the columns tabulated in the loop above
# Rename
for (i in (1:length(newname))){
names(table1)[names(table1) == oldname[i]] = newname[i]
}
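For reference, the loop above is equivalent to a single base-R assignment (use one or the other, not both, since oldname no longer exists after renaming):
# names(table1)[match(oldname, names(table1))] = newname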
table1 %>% kable("html", caption = "Proportions of Boolean attributes of each cluster") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| cluster_label | Binary_outcomes | Complain | Response | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 |
|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.9932341 | 0.8159675 | 0.9269283 | 0.8849797 | 0.8349120 | 0.8619756 | 0.9783491 |
| 1 | 1 | 0.0067659 | 0.1840325 | 0.0730717 | 0.1150203 | 0.1650880 | 0.1380244 | 0.0216509 |
| 2 | 0 | 0.9867947 | 0.8835534 | 0.9207683 | 0.9819928 | 0.9951981 | 0.9891957 | 1.0000000 |
| 2 | 1 | 0.0132053 | 0.1164466 | 0.0792317 | 0.0180072 | 0.0048019 | 0.0108043 | NA |
| 3 | 0 | 0.9939577 | 0.8489426 | 0.9350453 | 0.8987915 | 0.9456193 | 0.9501511 | 0.9788520 |
| 3 | 1 | 0.0060423 | 0.1510574 | 0.0649547 | 0.1012085 | 0.0543807 | 0.0498489 | 0.0211480 |
Principal component analysis (PCA) for mixed data
Visualize the data in a lower-dimensional space.
res.famd = FAMD(cust_data_without_ID, graph = F)
eig.value = get_eigenvalue(res.famd)
head(eig.value)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 6.743189 18.731080 18.73108
## Dim.2 2.205379 6.126053 24.85713
## Dim.3 1.939483 5.387452 30.24458
## Dim.4 1.688255 4.689596 34.93418
## Dim.5 1.231814 3.421705 38.35589
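The first three dimensions together capture only about 30% of the total variance (see cumulative.variance.percent above), so the 3D scatter plot below is a rough view of cluster separation rather than a faithful embedding. To see which variables drive a given dimension, factoextra also offers a contribution plot; a sketch:
fviz_contrib(res.famd, choice = "var", axes = 1) # variable contributions to Dim.1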
fviz_screeplot(res.famd)
# predict to get the transformed feature space and plot it on a 3-dimensional scatter plot
transformed_data = predict(res.famd, cust_data_without_ID)
The code snippet below is adapted from a Stack Overflow post.
options(rgl.useNULL = TRUE)
library(rgl)
## Warning: package 'rgl' was built under R version 4.1.3
scatter3d(x = transformed_data$coord[,1], y = transformed_data$coord[,2],
z = transformed_data$coord[,3],
groups = cust_data$cluster_label, grid = FALSE,
surface = FALSE, ellipsoid = TRUE,
surface.col = c("#80FF00", "#009999", "#FF007F"))
## Loading required namespace: mgcv
rglwidget()
A pop-up window will appear in RStudio if you run the above code chunk, and you can freely rotate the interactive 3D scatter plot.
Customer ranking
TOPSIS
The goal is to rank customers (the alternatives) within each cluster based on the Mnt, Num, Accepted and Response features; a sketch of the underlying computation follows.
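TOPSIS scores each alternative by its distance to an ideal and an anti-ideal point in a weighted, normalized criteria space. A hedged sketch of these steps (topsis_manual is a hypothetical helper; all criteria treated as benefit/"max" criteria, matching the MCDA::TOPSIS call further below):
topsis_manual = function(M, w){
  R = apply(M, 2, function(x) x / sqrt(sum(x^2))) # vector-normalize each column
  V = sweep(R, 2, w, "*")                         # apply the feature weights
  ideal = apply(V, 2, max); anti = apply(V, 2, min)
  d_pos = sqrt(rowSums(sweep(V, 2, ideal)^2))     # distance to the ideal point
  d_neg = sqrt(rowSums(sweep(V, 2, anti)^2))      # distance to the anti-ideal point
  d_neg / (d_pos + d_neg)                         # relative closeness in [0, 1]
}
topsis_manual(matrix(c(250, 40, 3, 80, 10, 7), nrow = 2, byrow = TRUE),
              w = c(0.5, 0.3, 0.2)) # toy matrix: 2 alternatives, 3 criteria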
Entropy-based feature weighting
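Entropy-based weighting derives each feature's weight from the information content of its observed distribution; the exact estimator and normalization used by mdw's entropy.weight() may differ, but the Shannon entropy at its core can be sketched for a discrete feature:
# Shannon entropy of the Response feature: more evenly spread outcomes
# give higher entropy, i.e. the feature discriminates more between customers
p = prop.table(table(cust_data$Response))
-sum(p * log(p))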
# Calculate feature weights for continuous and categorical features for first cluster
i = 1
data_analysis_cont = cust_data %>% filter(cluster_label==i) %>% dplyr::select(starts_with(c("Mnt", "Num")))
data_analysis_cate = cust_data %>% filter(cluster_label==i) %>%
dplyr::select(starts_with(c("Accepted", "Response")))
h_cont = get.bw(scale(data_analysis_cont), bw = "nrd", nb = 'na')
w_cont = entropy.weight(scale(data_analysis_cont), h = h_cont)
w_cate = entropy.weight(data_analysis_cate, h='na')
Customer ranking
Customer ranking for the 1st cluster.
# Convert factor levels "0"/"1" to numeric 0/1 so TOPSIS can use them
data_analysis_cate <- lapply(data_analysis_cate,
                             function(x) as.numeric(as.character(x)))
data_analysis = cbind(data_analysis_cont, data_analysis_cate)
feat_imp = c(w_cont, w_cate)
overall = TOPSIS(data_analysis,
feat_imp,
criteriaMinMax = rep("max", length(feat_imp)))
data_analysis$topsis_score = overall
data_analysis$ID = cust_data %>% filter(cluster_label==i) %>% pull(ID) # pull() returns a vector, unlike select()
# Top 10 customers for cluster 1
data_analysis %>% arrange(desc(topsis_score)) %>% as_tibble() %>% head(10) %>% relocate(ID, topsis_score) %>% kable("html", caption = "customer ranking according to TOPSIS scores") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| ID | topsis_score | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1763 | 0.5403485 | 1259 | 172 | 815 | 97 | 148 | 33 | 1 | 7 | 11 | 10 | 4 | 1 | 0 | 1 | 1 | 0 | 1 |
| 4580 | 0.5394507 | 1394 | 22 | 708 | 89 | 91 | 182 | 1 | 9 | 7 | 9 | 5 | 1 | 0 | 1 | 1 | 0 | 1 |
| 9010 | 0.5309458 | 968 | 147 | 842 | 137 | 42 | 210 | 1 | 5 | 7 | 10 | 2 | 1 | 0 | 1 | 1 | 0 | 1 |
| 477 | 0.5216004 | 1060 | 61 | 835 | 80 | 20 | 101 | 1 | 4 | 7 | 10 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
| 9058 | 0.5212907 | 1060 | 21 | 530 | 32 | 0 | 224 | 1 | 5 | 11 | 5 | 3 | 1 | 0 | 1 | 1 | 0 | 1 |
| 5758 | 0.5203677 | 1074 | 0 | 69 | 0 | 0 | 46 | 1 | 10 | 4 | 13 | 6 | 1 | 0 | 1 | 1 | 1 | 1 |
| 7832 | 0.5181207 | 940 | 44 | 396 | 0 | 88 | 58 | 1 | 8 | 7 | 7 | 4 | 1 | 0 | 1 | 1 | 0 | 1 |
| 7872 | 0.5130449 | 179 | 21 | 273 | 0 | 21 | 63 | 1 | 6 | 10 | 6 | 5 | 1 | 0 | 1 | 1 | 0 | 1 |
| 3520 | 0.5102894 | 162 | 28 | 818 | 0 | 28 | 56 | 0 | 4 | 3 | 7 | 3 | 1 | 0 | 1 | 1 | 1 | 1 |
| 5538 | 0.5093125 | 897 | 161 | 430 | 186 | 161 | 27 | 0 | 4 | 7 | 6 | 1 | 1 | 0 | 1 | 1 | 0 | 1 |
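The same steps can be repeated for the remaining clusters. A hedged sketch wrapping them in a helper (rank_cluster is a hypothetical name; the body mirrors the code above):
rank_cluster = function(i){
  d_cont = cust_data %>% filter(cluster_label == i) %>%
    dplyr::select(starts_with(c("Mnt", "Num")))
  d_cate = cust_data %>% filter(cluster_label == i) %>%
    dplyr::select(starts_with(c("Accepted", "Response"))) %>%
    mutate(across(everything(), ~ as.numeric(as.character(.x))))
  w = c(entropy.weight(scale(d_cont), h = get.bw(scale(d_cont), bw = "nrd", nb = 'na')),
        entropy.weight(d_cate, h = 'na'))
  scores = TOPSIS(cbind(d_cont, d_cate), w, criteriaMinMax = rep("max", length(w)))
  cust_data %>% filter(cluster_label == i) %>%
    mutate(topsis_score = scores) %>%
    arrange(desc(topsis_score)) %>% select(ID, topsis_score)
}
rank_cluster(2) # ranking for the 2nd cluster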
R.version
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 4
## minor 1.2
## year 2021
## month 11
## day 01
## svn rev 81115
## language R
## version.string R version 4.1.2 (2021-11-01)
## nickname Bird Hippie