#Load the packages
packages <- readLines("requirements.txt")
lapply(packages, library, character.only = TRUE)
[[1]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[2]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[3]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[4]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[5]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[6]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[7]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[8]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[9]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[10]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
[[11]]
[1] "NbClust" "hopkins" "fastcluster" "cluster" "factoextra" "modelsummary" "lubridate" "forcats" "stringr"
[10] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "dplyr" "magrittr" "stats"
[19] "graphics" "grDevices" "utils" "datasets" "methods" "base"
As a starting point of the customer segmentation, I will pick out only the behavioural attributes from the ‘raw’ dataframe. This new dataframe will be called behavioural_data and all the columns must be scaled. Considerint, that there are only numeric values in all columns, I need to transform them with a scaling function. The data will be fit within a specific range/distribution to improve the performance of the model.
behavioral_data<-raw %>% data.frame() %>%
select(InternetTrafficVolume,
MortageVolume,
AccountSpending,
CreditCardSpending,
HelpHotlineTime,
CustomerSince,
GrocerySpending,
StockVolume,
CreditVolume,
NASDAQInvest,
USAXSFundInvest,
BranchVisits,
AppLogins,
ATMVisits,
TimeOnlineBanking,
ServiceFees,
SocialMediaInter,
Bitcoins,
NFTs)
head(behavioral_data)
set.seed(123) #Insert the same seed to get the same scaled numerical values
behavioral_data_scaled<-scale(behavioral_data)
head(behavioral_data_scaled)
InternetTrafficVolume MortageVolume AccountSpending CreditCardSpending HelpHotlineTime CustomerSince GrocerySpending StockVolume
[1,] -0.2889997 1.6471061 -0.3450280 0.8636732 -0.8632445 0.7723802 -0.34104672 -0.6842514
[2,] -0.9537126 1.2700695 -0.1509407 -0.6818782 -0.8427150 0.7723802 0.19341832 -0.5016177
[3,] -0.3192140 0.5749801 -0.3528008 0.5714726 -0.9302276 0.7723802 0.12646708 -0.6849314
[4,] 0.1037852 1.3859733 -0.2801354 0.2591261 -0.6214784 0.7723802 0.08311691 -0.5265523
[5,] -0.8630700 1.0690862 -0.1072321 0.2034089 -0.6937764 0.8184972 -0.75375557 -0.7384732
[6,] -0.4702851 0.9330313 -0.3513856 -0.1526760 -0.5917007 0.7723802 0.09847517 -0.3919688
CreditVolume NASDAQInvest USAXSFundInvest BranchVisits AppLogins ATMVisits TimeOnlineBanking ServiceFees SocialMediaInter Bitcoins
[1,] -0.4354132 -0.2353360 -0.3417909 -0.28345927 -1.319352 1.751956 -0.7152344 0.02878044 0.4675133 -0.8542812
[2,] -0.4407973 -0.2240967 -0.3244452 0.02716123 -1.059550 1.321701 -0.7897758 0.36121889 0.3502121 -0.8520388
[3,] -0.4507881 -0.2269506 -0.3549125 -0.28345927 -1.203885 1.321701 -0.9289516 0.44046295 0.5261639 -0.8076386
[4,] -0.4594416 -0.2294082 -0.3270950 0.02716123 -1.203885 1.321701 -0.8941460 0.02794600 0.8780673 -0.8614570
[5,] -0.4396339 -0.2268875 -0.3721992 -0.28345927 -1.146151 1.751956 -0.7894912 0.52092062 0.8780673 -0.8349963
[6,] -0.4472727 -0.2259212 -0.2313141 0.02716123 -1.232751 1.321701 -0.9766564 0.51830595 0.2915616 -0.8551782
NFTs
[1,] -0.4895549
[2,] -0.8611631
[3,] -0.8611631
[4,] -0.8611631
[5,] -1.2327713
[6,] -0.1179467
library(stats)
library(factoextra)
library(cluster)
library(fastcluster)
random_subset<-behavioral_data_scaled[sample(nrow(behavioral_data_scaled),1000), ]
dist.eucl<-get_dist(random_subset, method = "euclid")
round(as.matrix(dist.eucl)[1:3,1:3],1)
1 2 3
1 0.0 4.5 6.8
2 4.5 0.0 7.4
3 6.8 7.4 0.0
fviz_dist(dist.eucl)
hist(dist.eucl)
median(dist.eucl)
[1] 6.148364
I took a random sample of scaled data, because calculating distance between objects is heavy process when running locally.
I used the Euclidean distance method to measure and overview the distribution of a random sample of objects. High values mean high separation between clusters and low value could mean tight cohesion inside clusters. The distribution of the colours on the matrix shows that there are underlying clusters.In order to fully understand those requirements of clusterisation, I will provide additional metrics and scores.
This validation is prerequisite before modelling, so we have a basic idea of what the data looks like when plotted. In simple terms, I will be looking to find ‘clouds’ on the following chart. If there are obvious chunks of data points, that means we have good probability to build good clustering model.
fviz_pca_ind(prcomp(behavioral_data_scaled),
geom = "point", palette = 'jco',
ggtheme = theme_classic(),
title = 'Visual inspection of the data for clusterization')
It becomes clear there are 6 visible groups of data. However, that does not mean that 6 clusters is the optimal number we want to use for the modelling. There are other methods, which will provide better orientation.
This is a statistical hypothesis test, which defines a Null Hypothesis that the data is generated by a Poisson process, consequently the data is evenly distributed. If the value of the Hopkins test is above 0.7 to 1, that means the data has good tendency to be clustered or we can also say that the data is not random and there is connection between segments.
library(hopkins)
set.seed(123)
hopkins_test<-hopkins(behavioral_data_scaled,m = nrow(behavioral_data_scaled)-1)
print(paste0("Hopkins statistic: ", hopkins_test))
[1] "Hopkins statistic: 0.999999999999999"
h_pval<-hopkins.pval(hopkins_test, n = nrow(behavioral_data_scaled)-1)
print(paste0("p-value for statistical significance: ", h_pval))
[1] "p-value for statistical significance: 0"
The data show a high tendency for clustering hopkins_test = 0.99, and Hopkins statistical significance below 0.05, i.e. it is significant.
In this part of the project I will apply 4 different methods when choosing the optimal number of clusters. The main function we use is fviz_nbclust from the factoextra package. The most advanced method that we will use is NbClust() from the NbClust package, which provides the widest scope of clustering validation results. For all of the methods, I used kmeans as the algorithm to compare with.
library(NbClust)
# Elbow method
wss<-fviz_nbclust(behavioral_data_scaled, kmeans, method = "wss") + labs(title = "Optimal number of clusters (Elbow method)")
wss
# Silhouette method
silh<-fviz_nbclust(behavioral_data_scaled, kmeans, method = "silhouette") +labs(title = "Optimal number of clusters (Silhouette method")
silh
# Gap statistics
set.seed(123)
pca_sample<- prcomp(behavioral_data_scaled)
reduced_data<-pca_sample$x[ ,1:3]
gap<-fviz_nbclust(reduced_data, FUNcluster = function(x,k) kmeans(x, centers = k, nstart = 10, iter.max = 100),
method = "gap_stat",
nboot = 20) + labs(title = "Optimal number of clusters (Gap statistics)")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 20) [one "." per sample]:
.................... 20
gap
NbClust is the most advanced method, which allows me to compare withing a range of k (2-8).
!Please note that NbClust is heavy function to process locally on standard PC and it usually takes time!
nb<-NbClust(data = behavioral_data_scaled, distance = "euclidean", min.nc = 2, max.nc = 8, method = "kmeans") #Pick min and max number of clusters.
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 1 proposed 2 as the best number of clusters
* 3 proposed 3 as the best number of clusters
* 6 proposed 4 as the best number of clusters
* 2 proposed 6 as the best number of clusters
* 9 proposed 7 as the best number of clusters
* 2 proposed 8 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 7
*******************************************************************
barNbclust<-barplot(table(nb$Best.nc[1,]), xlab ="Number of clusters", ylab = "Number of criterias")
barNbclust
[,1]
[1,] 0.7
[2,] 1.9
[3,] 3.1
[4,] 4.3
[5,] 5.5
[6,] 6.7
[7,] 7.9
[8,] 9.1
The optimal number of clusters after running NbClust is 7. Also, with 4 clusters, we can read a good evaluation of the validation method.
The conclusion we can draw after validating the number of clusters is that they vary between 6 and 8. Only the NbClust function shows a good estimate for 4 segments. It is significant to note that both Elbow and Silhouette method have a smoothing in the line at around 5 clusters.
In the next part of the project, I will test out the majority of clustering models and algorithms. The test will be performed with k (number of clusters) around 7. Right below every model, I will analyse the model with validating visualizations and evaluations.