Load library packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(e1071) # for skewness calculation
require(cluster) # PAM algorithm
## Loading required package: cluster
library(fpc) # cluster evaluation metrics
library(mdw) # entropy-based feature weights
library(MCDA) # TOPSIS
library(FactoMineR) # Factor analysis for mixed data (FAMD)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(knitr) # For beautiful table display
library(car) # For interactive 3D scatter plot
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Load the previously saved data
# set the working directory
setwd("~/ml projects/customer segmentation")
cust_data = readRDS("preprocessed_cust_data.rds")
Cluster analysis
Dissimilarity matrix computation using the Gower distance measure
For mixed data types, the daisy() function from the cluster package is used to calculate the pairwise dissimilarity between observations. The type argument of daisy() specifies the types of the variables in the input cust_data_without_ID. More information on daisy() can be obtained via the help() function.
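As a quick illustration, here is a minimal sketch of a daisy() call on toy data (hypothetical values, not from cust_data) mixing a continuous, a nominal and an asymmetric binary column:
toy = data.frame(income = c(30000, 58000, 72000),     # continuous
                 Teenhome = factor(c("0", "1", "1")), # nominal
                 Response = c(1, 0, 1))               # asymmetric binary (0/1)
# Gower rescales each variable's contribution to [0, 1] and averages them;
# asymm = 3 tells daisy() that joint absence (0/0) in column 3 carries no information.
daisy(toy, metric = "gower", type = list(asymm = 3))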
# Cluster analysis with k-medoids (PAM) algorithm
idx_col_retain = !(colnames(cust_data) %in% c("ID", "min_num_household", "tot_AcceptedCmp"))
cust_data_without_ID = cust_data[,idx_col_retain]
# Boolean attributes
seq_binary = c("Complain", "Response", "Accepted")
idx_binary = sapply(seq_binary,
function(x) grep(x, colnames(cust_data_without_ID)))
idx_binary = unlist(idx_binary)
# continuous attributes
cont_features_patterns = c("Mnt", "Num", "Income","Recency", "age",
"days_enroll")
idx_cont = sapply(cont_features_patterns,
function(x) grep(x, colnames(cust_data_without_ID)))
idx_cont = unlist(idx_cont)
skewness_col = apply(cust_data_without_ID[, idx_cont], 2, skewness)
idx_logtrans = idx_cont[which(abs(skewness_col)>1)]
# Ordinal attributes
dissimilarity_matrix = daisy(cust_data_without_ID, metric = "gower",
type = list(ordratio = grep("home", colnames(cust_data_without_ID)), # indices must refer to the data passed to daisy()
asymm = idx_binary,
logratio = idx_logtrans))
K-medoid clustering (PAM algorithm)
Selection of the number of clusters, k, using cluster evaluation measures
Cluster evaluation can be divided into two categories:
- Internal: Typical objective functions in clustering formalize the goal of attaining high intra-cluster similarity (data within a cluster are similar) and low inter-cluster similarity (data from different clusters are dissimilar). Similarity, a numerical measure of how alike two data instances are, is normally quantified by a distance measure (e.g. Euclidean distance).
- External: A ground truth or gold standard must be available, and evaluation measures how well the clustering results match the ground-truth labels.
I will be using the internal cluster evaluation metrics found in the fpc package, namely the elbow method (within-cluster sum of squares), average silhouette width, Calinski-Harabasz index and Dunn index, to find the optimal number of clusters.
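For intuition, the average silhouette width for a single candidate k can be computed directly with cluster::silhouette. A minimal sketch (pam_trial is a hypothetical name; assumes dissimilarity_matrix from the previous chunk):
pam_trial = pam(dissimilarity_matrix, k = 3, diss = TRUE)
sil = silhouette(pam_trial$clustering, dissimilarity_matrix)
mean(sil[, "sil_width"]) # the same quantity as cluster.stats(...)$avg.silwidth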
# The number of clusters is suspected to be in the range of 2 to 8.
k_array = seq(from = 2, to = 8, by=1)
cluster_eval_df = data.frame(matrix(NA, nrow = length(k_array), ncol = 4))
colnames(cluster_eval_df) = c("silhouette", "CH_index", "Dunn_index", "wc_SOS")
cluster_eval_df$k = k_array
for (i in (1:length(k_array))){
set.seed(i+100)
kmedoid = pam(dissimilarity_matrix, k = k_array[i], diss = TRUE,
nstart = 10)
# diss = TRUE treats the input as a dissimilarity matrix; use 10 random starts.
clust_stat = cluster.stats(dissimilarity_matrix,
clustering = kmedoid$clustering)
cluster_eval_df[i,"silhouette"] = clust_stat$avg.silwidth
cluster_eval_df[i, "CH_index"] = clust_stat$ch
cluster_eval_df[i, "Dunn_index"] = clust_stat$dunn
# Add in the within cluster sum of squares
cluster_eval_df[i, "wc_SOS"] = clust_stat$within.cluster.ss
}
# Line plot for all evaluation metrics (internal cluster evaluation)
cluster_eval_df %>%
pivot_longer(!k, names_to = "cluster_eval", values_to = "value") %>%
ggplot(aes(x = k, y = value)) +
geom_line() + geom_point() + facet_grid(rows = vars(cluster_eval),
scales = "free_y")# find the best_k from stackoverflow using the mode function found on
# https://stackoverflow.com/questions/2547402/how-to-find-the-statistical-mode?page=1&tab=scoredesc#tab-top
mode = function(x){
ux = unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
best_k = mode(c(which.max(cluster_eval_df$silhouette),
                which.max(cluster_eval_df$CH_index),
                which.max(cluster_eval_df$Dunn_index))) + 1 # k = index + 1 since k_array starts at 2
Final clustering model
set.seed(100 + best_k - 1) # same seed as the loop iteration that produced k = best_k
kmedoid = pam(dissimilarity_matrix, k = best_k, diss = TRUE, nstart = 10)
# Save the cluster label in the dataframe. Change it to factor to facilitate data wrangling.
cust_data$cluster_label = as.factor(kmedoid$clustering)
Cluster analysis
Empirical analysis (Statistical summaries) of each cluster
#1: Number of observations (customers)
cust_data %>% group_by(cluster_label) %>%
summarise(n = n()) %>% kable(caption = "Number of observations in each cluster")
| cluster_label | n |
|---|---|
| 1 | 739 |
| 2 | 833 |
| 3 | 662 |
#2: Average of numerical features for each cluster
summary_cont_features_per_cluster =
cust_data %>% group_by(cluster_label) %>%
summarise_if(is.numeric, mean) %>% select(-ID)
kable(summary_cont_features_per_cluster, "html", caption = "mean attributes for each cluster") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| cluster_label | Income | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | min_num_household | tot_AcceptedCmp | age | days_enroll |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 67575.36 | 49.68200 | 484.30311 | 50.635995 | 321.79838 | 71.07713 | 51.048715 | 72.24357 | 1.807848 | 5.185386 | 4.5575101 | 7.801082 | 3.746955 | 2.069012 | 0.6968877 | 46.78214 | 362.2679 |
| 2 | 33797.88 | 47.93758 | 73.59784 | 7.223289 | 39.27131 | 10.43938 | 8.028811 | 21.37815 | 2.500600 | 2.566627 | 0.7394958 | 3.480192 | 6.830732 | 2.926771 | 0.2292917 | 40.00840 | 350.3902 |
| 3 | 55723.21 | 50.10725 | 393.01813 | 23.126888 | 155.21601 | 34.29154 | 24.370091 | 40.88520 | 2.676737 | 4.767372 | 2.9743202 | 6.469788 | 5.163142 | 2.768882 | 0.4425982 | 49.64804 | 354.4955 |
# A list() of functions can be passed to summarise_if() to compute several statistics at once.
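For example, a sketch computing both the mean and standard deviation per cluster (suffixed column names such as Income_mean and Income_sd are produced automatically):
cust_data %>% group_by(cluster_label) %>%
  summarise_if(is.numeric, list(mean = mean, sd = sd)) %>%
  select(-starts_with("ID"))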
#3: Distribution of categorical features for each cluster
cate_features_names = names(Filter(is.factor, cust_data))
# Change the second argument of group_by() and the fill aesthetic in aes() to the desired categorical feature name: Education, Marital_status, Kidhome or Teenhome.
cust_data %>% select(one_of(cate_features_names)) %>%
group_by(cluster_label, Teenhome) %>% summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>%
ggplot(aes(x = cluster_label, y = prop, fill = Teenhome)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
position = position_dodge(0.9)) +
theme_minimal()
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.
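To avoid editing the chunk by hand for each feature, the plot can be wrapped in a small helper. A hedged sketch (plot_cate_dist is a hypothetical name, not part of the original analysis):
plot_cate_dist = function(df, feature){
  df %>% count(cluster_label, .data[[feature]]) %>%
    group_by(cluster_label) %>% mutate(prop = n / sum(n)) %>%
    ggplot(aes(x = cluster_label, y = prop, fill = .data[[feature]])) +
    geom_bar(stat = "identity", position = position_dodge()) +
    geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
              position = position_dodge(0.9)) +
    theme_minimal()
}
plot_cate_dist(cust_data, "Education") # repeat for Marital_status, Kidhome, Teenhome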
#4: Distribution of minimum number of household members for each cluster
cust_data$min_num_household = factor(cust_data$min_num_household, ordered = TRUE)
cust_data %>% group_by(cluster_label, min_num_household) %>% summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>%
ggplot(aes(x = cluster_label, y = prop, fill = min_num_household)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = round(prop, 2)), vjust = 1.5, size = 2,
position = position_dodge(0.9)) +
theme_minimal()
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.
#5: Tabulate binary (Boolean) variables for each cluster
table1 =
cust_data %>%
group_by(cluster_label, Complain) %>% summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>% select(-n)
## `summarise()` has grouped output by 'cluster_label'. You can override using the
## `.groups` argument.
colnames(table1)[2] = "Binary_outcomes"
#pattern = c("Complain", "Response", "Accept", "cluster")
#idx_select = lapply(pattern, function (x) grep(x, colnames(cust_data)))
#idx = unlist(idx_select)
cust_data_1 = cust_data %>%
select(contains("Response") | contains("Accept") | contains("cluster"))
n_col = ncol(cust_data_1)
for (i in (1:(n_col-2))){
table2 = cust_data_1 %>% select(c(all_of(n_col), all_of(i))) %>%
group_by_all() %>%
summarise(n = n()) %>%
mutate(prop = (n/sum(n))) %>%
select(-n)
colnames(table2)[2] = "Binary_outcomes"
table1 = table1 %>%
left_join(table2, by = c("cluster_label" = "cluster_label",
"Binary_outcomes" = "Binary_outcomes"))
}
## `summarise()` has grouped output by 'cluster_label'. You can override using the `.groups` argument.
oldname = colnames(table1)[3:length(colnames(table1))]
newname = c("Complain",
            colnames(cust_data_1)[1:(n_col-2)]) # only the columns tabulated in the loop above
# Rename
for (i in (1:length(newname))){
names(table1)[names(table1) == oldname[i]] = newname[i]
}
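For reference, the loop above is equivalent to a single base-R assignment (use one or the other, not both, since oldname no longer exists after renaming):
# names(table1)[match(oldname, names(table1))] = newname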
table1 %>% kable("html", caption = "Proportions of Boolean attributes of each cluster") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| cluster_label | Binary_outcomes | Complain | Response | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 |
|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.9932341 | 0.8159675 | 0.9269283 | 0.8849797 | 0.8349120 | 0.8619756 | 0.9783491 |
| 1 | 1 | 0.0067659 | 0.1840325 | 0.0730717 | 0.1150203 | 0.1650880 | 0.1380244 | 0.0216509 |
| 2 | 0 | 0.9867947 | 0.8835534 | 0.9207683 | 0.9819928 | 0.9951981 | 0.9891957 | 1.0000000 |
| 2 | 1 | 0.0132053 | 0.1164466 | 0.0792317 | 0.0180072 | 0.0048019 | 0.0108043 | NA |
| 3 | 0 | 0.9939577 | 0.8489426 | 0.9350453 | 0.8987915 | 0.9456193 | 0.9501511 | 0.9788520 |
| 3 | 1 | 0.0060423 | 0.1510574 | 0.0649547 | 0.1012085 | 0.0543807 | 0.0498489 | 0.0211480 |
Principal component analysis (PCA) for mixed data
Visualize the data in a lower-dimensional space.
res.famd = FAMD(cust_data_without_ID, graph = F)
eig.value = get_eigenvalue(res.famd)
head(eig.value)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 6.743189 18.731080 18.73108
## Dim.2 2.205379 6.126053 24.85713
## Dim.3 1.939483 5.387452 30.24458
## Dim.4 1.688255 4.689596 34.93418
## Dim.5 1.231814 3.421705 38.35589
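The first three dimensions together capture only about 30% of the total variance (see cumulative.variance.percent above), so the 3D scatter plot below is a rough view of cluster separation rather than a faithful embedding. To see which variables drive a given dimension, factoextra also offers a contribution plot; a sketch:
fviz_contrib(res.famd, choice = "var", axes = 1) # variable contributions to Dim.1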
fviz_screeplot(res.famd)
# predict to get the transformed feature space and plot it on a 3-dimensional scatter plot
transformed_data = predict(res.famd, cust_data_without_ID)
The code snippet below is adapted from a Stack Overflow post.
options(rgl.useNULL = TRUE)
library(rgl)
## Warning: package 'rgl' was built under R version 4.1.3
scatter3d(x = transformed_data$coord[,1], y = transformed_data$coord[,2],
z = transformed_data$coord[,3],
groups = cust_data$cluster_label, grid = FALSE,
surface = FALSE, ellipsoid = TRUE,
surface.col = c("#80FF00", "#009999", "#FF007F"))
## Loading required namespace: mgcv
rglwidget()
A pop-up window will appear in RStudio if you run the above code chunk, and you can freely rotate the interactive 3D scatter plot.
Customer ranking
TOPSIS
The goal is to rank customers (the alternatives) within each cluster based on the Mnt, Num, Accepted and Response features; a sketch of the underlying computation follows.
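TOPSIS scores each alternative by its distance to an ideal and an anti-ideal point in a weighted, normalized criteria space. A hedged sketch of these steps (topsis_manual is a hypothetical helper; all criteria treated as benefit/"max" criteria, matching the MCDA::TOPSIS call further below):
topsis_manual = function(M, w){
  R = apply(M, 2, function(x) x / sqrt(sum(x^2))) # vector-normalize each column
  V = sweep(R, 2, w, "*")                         # apply the feature weights
  ideal = apply(V, 2, max); anti = apply(V, 2, min)
  d_pos = sqrt(rowSums(sweep(V, 2, ideal)^2))     # distance to the ideal point
  d_neg = sqrt(rowSums(sweep(V, 2, anti)^2))      # distance to the anti-ideal point
  d_neg / (d_pos + d_neg)                         # relative closeness in [0, 1]
}
topsis_manual(matrix(c(250, 40, 3, 80, 10, 7), nrow = 2, byrow = TRUE),
              w = c(0.5, 0.3, 0.2)) # toy matrix: 2 alternatives, 3 criteria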
Entropy-based feature weighting
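Entropy-based weighting derives each feature's weight from the information content of its observed distribution; the exact estimator and normalization used by mdw's entropy.weight() may differ, but the Shannon entropy at its core can be sketched for a discrete feature:
# Shannon entropy of the Response feature: more evenly spread outcomes
# give higher entropy, i.e. the feature discriminates more between customers
p = prop.table(table(cust_data$Response))
-sum(p * log(p))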
# Calculate feature weights for continuous and categorical features for first cluster
i = 1
data_analysis_cont = cust_data %>% filter(cluster_label==i) %>% dplyr::select(starts_with(c("Mnt", "Num")))
data_analysis_cate = cust_data %>% filter(cluster_label==i) %>%
dplyr::select(starts_with(c("Accepted", "Response")))
h_cont = get.bw(scale(data_analysis_cont), bw = "nrd", nb = 'na')
w_cont = entropy.weight(scale(data_analysis_cont), h = h_cont)
w_cate = entropy.weight(data_analysis_cate, h='na')
Customer ranking
Customer ranking for the 1st cluster.
# Convert factor levels "0"/"1" to numeric 0/1 so TOPSIS can use them
data_analysis_cate <- lapply(data_analysis_cate,
                             function(x) as.numeric(as.character(x)))
data_analysis = cbind(data_analysis_cont, data_analysis_cate)
feat_imp = c(w_cont, w_cate)
overall = TOPSIS(data_analysis,
feat_imp,
criteriaMinMax = rep("max", length(feat_imp)))
data_analysis$topsis_score = overall
data_analysis$ID = cust_data %>% filter(cluster_label==i) %>% pull(ID) # pull() returns a vector, unlike select()
# Top 10 customers for cluster 1
data_analysis %>% arrange(desc(topsis_score)) %>% as_tibble() %>% head(10) %>% relocate(ID, topsis_score) %>% kable("html", caption = "customer ranking according to TOPSIS scores") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| ID | topsis_score | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1763 | 0.5403485 | 1259 | 172 | 815 | 97 | 148 | 33 | 1 | 7 | 11 | 10 | 4 | 1 | 0 | 1 | 1 | 0 | 1 |
| 4580 | 0.5394507 | 1394 | 22 | 708 | 89 | 91 | 182 | 1 | 9 | 7 | 9 | 5 | 1 | 0 | 1 | 1 | 0 | 1 |
| 9010 | 0.5309458 | 968 | 147 | 842 | 137 | 42 | 210 | 1 | 5 | 7 | 10 | 2 | 1 | 0 | 1 | 1 | 0 | 1 |
| 477 | 0.5216004 | 1060 | 61 | 835 | 80 | 20 | 101 | 1 | 4 | 7 | 10 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
| 9058 | 0.5212907 | 1060 | 21 | 530 | 32 | 0 | 224 | 1 | 5 | 11 | 5 | 3 | 1 | 0 | 1 | 1 | 0 | 1 |
| 5758 | 0.5203677 | 1074 | 0 | 69 | 0 | 0 | 46 | 1 | 10 | 4 | 13 | 6 | 1 | 0 | 1 | 1 | 1 | 1 |
| 7832 | 0.5181207 | 940 | 44 | 396 | 0 | 88 | 58 | 1 | 8 | 7 | 7 | 4 | 1 | 0 | 1 | 1 | 0 | 1 |
| 7872 | 0.5130449 | 179 | 21 | 273 | 0 | 21 | 63 | 1 | 6 | 10 | 6 | 5 | 1 | 0 | 1 | 1 | 0 | 1 |
| 3520 | 0.5102894 | 162 | 28 | 818 | 0 | 28 | 56 | 0 | 4 | 3 | 7 | 3 | 1 | 0 | 1 | 1 | 1 | 1 |
| 5538 | 0.5093125 | 897 | 161 | 430 | 186 | 161 | 27 | 0 | 4 | 7 | 6 | 1 | 1 | 0 | 1 | 1 | 0 | 1 |
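The same steps can be repeated for the remaining clusters. A hedged sketch wrapping them in a helper (rank_cluster is a hypothetical name; the body mirrors the code above):
rank_cluster = function(i){
  d_cont = cust_data %>% filter(cluster_label == i) %>%
    dplyr::select(starts_with(c("Mnt", "Num")))
  d_cate = cust_data %>% filter(cluster_label == i) %>%
    dplyr::select(starts_with(c("Accepted", "Response"))) %>%
    mutate(across(everything(), ~ as.numeric(as.character(.x))))
  w = c(entropy.weight(scale(d_cont), h = get.bw(scale(d_cont), bw = "nrd", nb = 'na')),
        entropy.weight(d_cate, h = 'na'))
  scores = TOPSIS(cbind(d_cont, d_cate), w, criteriaMinMax = rep("max", length(w)))
  cust_data %>% filter(cluster_label == i) %>%
    mutate(topsis_score = scores) %>%
    arrange(desc(topsis_score)) %>% select(ID, topsis_score)
}
rank_cluster(2) # ranking for the 2nd cluster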
R.version
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 4
## minor 1.2
## year 2021
## month 11
## day 01
## svn rev 81115
## language R
## version.string R version 4.1.2 (2021-11-01)
## nickname Bird Hippie