Customer Segmentation Using Machine Learning

Project Scope: This project aims to understand and work through the process of dividing a customer base into several groups that share similarities in ways relevant to marketing techniques or spending habits.

Customer segmentation can be very important for companies: by segmenting their customers, companies can dig deeper and give customers more personalized recommendations.

Benefits of customer segmentation.
1. Identify least and most profitable aspects of business.
2. Improved marketing plans based on customer needs.
3. Improved customer service.
4. Gain customer loyalty by offering customers great deals and discounted prices on the products they are interested in.
5. Establish brand identity.

A. Data Exploration.

# Load the customer data set from disk and print its structure.
customer_csv <- "C:/data/Customer_segment.csv"
Customer_segment <- read.csv(file = customer_csv, header = TRUE)
str(Customer_segment)

## 'data.frame':    200 obs. of  5 variables:
##  $ CustomerID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ Age                   : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ Annual.Income..k..    : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending.Score..1.100.: int  39 81 6 77 40 76 6 94 3 72 ...

# Count customers per gender and show the counts as a bar chart.
gender_dist <- table(Customer_segment$Gender)
barplot(
  gender_dist,
  main = "Barplot to display gender comparison",
  xlab = "Gender",
  ylab = "count",
  legend.text = names(gender_dist)
)

# Histogram of customer age, with the count printed above each bar.
hist(
  Customer_segment$Age,
  main = "Histogram to Show Count of Age ",
  xlab = "Age Class",
  ylab = "Frequency",
  col = "Red",
  labels = TRUE
)

# Kernel density estimate of annual income: draw the curve, then fill the
# area under it. The estimate is computed once and used for both calls.
income_density <- density(Customer_segment$Annual.Income..k..)
plot(
  income_density,
  main = "Density Plot for Annual Income",
  xlab = "Annual Income ",
  ylab = "Density",
  col = "Blue"
)
polygon(income_density, col = "Blue")

# Horizontal box plot of the spending score.
boxplot(
  Customer_segment$Spending.Score..1.100.,
  main = "BoxPlot for Descriptive Analysis of Spending Score",
  col = "#990000",
  horizontal = TRUE
)

K means clustering algorithm

As the name suggests, it is a clustering algorithm. K-means is one of the most popular clustering algorithms. The optimal number of clusters has to be chosen by the user.

There are three methods to choose the optimal number of clusters: 1. Elbow method 2. Silhouette method 3. Gap Statistic.

Elbow method

library(purrr)
set.seed(123)

# Total within-cluster sum of squares of a k-means fit with `k` centers,
# run on columns 3 to 5 of Customer_segment.
iss <- function(k) {
  fit <- kmeans(
    x = Customer_segment[, 3:5],
    centers = k,
    iter.max = 100,
    nstart = 100,
    algorithm = "Lloyd"
  )
  fit$tot.withinss
}

# Candidate cluster counts and the matching within-cluster sums of squares.
k.values <- 1:10
iss_values <- map_dbl(k.values, iss)

# Elbow curve: within-cluster sum of squares against the number of clusters.
plot(
  k.values,
  iss_values,
  type = "b",
  pch = 19,
  frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total intra-clusters sum of squares"
)

Silhouette method

library(cluster) 
library(gridExtra)
library(grid)


# Silhouette analysis: fit k-means for 2 to 10 clusters on columns 3 to 5 of
# Customer_segment and draw one silhouette plot per fit.
#
# The Euclidean distance matrix depends only on the data, not on the number
# of clusters, so it is computed once and shared by every silhouette() call.
silhouette_features <- Customer_segment[, 3:5]
silhouette_dist <- dist(silhouette_features, method = "euclidean")

for (n_clusters in 2:10) {
  fit <- kmeans(
    silhouette_features,
    centers = n_clusters,
    iter.max = 100,
    nstart = 50,
    algorithm = "Lloyd"
  )
  # Keep the top-level names k2 ... k10 (fits) and s2 ... s10 (value returned
  # by plot()) defined, so code that refers to them keeps working.
  assign(paste0("k", n_clusters), fit, envir = environment())
  assign(
    paste0("s", n_clusters),
    plot(silhouette(fit$cluster, silhouette_dist)),
    envir = environment()
  )
}

library(NbClust)
library(factoextra)

## Warning: package 'factoextra' was built under R version 3.6.3

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Silhouette-based plot for choosing the number of k-means clusters.
fviz_nbclust(
  x = Customer_segment[, 3:5],
  FUNcluster = kmeans,
  method = "silhouette"
)

Gap Statistic Method

set.seed(125)

# Gap statistic for up to 10 k-means clusters using 50 bootstrap samples;
# nstart is forwarded to kmeans().
stat_gap <- clusGap(
  x = Customer_segment[, 3:5],
  FUNcluster = kmeans,
  K.max = 10,
  B = 50,
  nstart = 25
)
fviz_gap_stat(stat_gap)

Assuming K = 6 as the optimal number of clusters.

# Final model: k-means with 6 clusters on columns 3 to 5, then show the fit.
k6 <- kmeans(
  x = Customer_segment[, 3:5],
  centers = 6,
  iter.max = 100,
  nstart = 50,
  algorithm = "Lloyd"
)
k6

## K-means clustering with 6 clusters of sizes 45, 22, 21, 38, 35, 39
## 
## Cluster means:
##        Age Annual.Income..k.. Spending.Score..1.100.
## 1 56.15556           53.37778               49.08889
## 2 25.27273           25.72727               79.36364
## 3 44.14286           25.14286               19.52381
## 4 27.00000           56.65789               49.13158
## 5 41.68571           88.22857               17.28571
## 6 32.69231           86.53846               82.12821
## 
## Clustering vector:
##   [1] 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3
##  [36] 2 3 2 3 2 1 2 1 4 3 2 1 4 4 4 1 4 4 1 1 1 1 1 4 1 1 4 1 1 1 4 1 1 4 4
##  [71] 1 1 1 1 1 4 1 4 4 1 1 4 1 1 4 1 1 4 4 1 1 4 1 4 4 4 1 4 1 4 4 1 1 4 1
## [106] 4 1 1 1 1 1 4 4 4 4 4 1 1 1 1 4 4 4 6 4 6 5 6 5 6 5 6 4 6 5 6 5 6 5 6
## [141] 5 6 4 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5
## [176] 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6
## 
## Within cluster sum of squares by cluster:
## [1]  8062.133  4099.818  7732.381  7742.895 16690.857 13972.359
##  (between_SS / total_SS =  81.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Visualizing the clusters.

# Principal component analysis of columns 3 to 5, without rescaling the
# variables to unit variance.
pcclust <- prcomp(Customer_segment[, 3:5], scale. = FALSE)
summary(pcclust)

## Importance of components:
##                            PC1     PC2     PC3
## Standard deviation     26.4625 26.1597 12.9317
## Proportion of Variance  0.4512  0.4410  0.1078
## Cumulative Proportion   0.4512  0.8922  1.0000

# Loadings of the original variables on the first two principal components.
pcclust$rotation[, c("PC1", "PC2")]

##                               PC1        PC2
## Age                     0.1889742 -0.1309652
## Annual.Income..k..     -0.5886410 -0.8083757
## Spending.Score..1.100. -0.7859965  0.5739136

set.seed(1)

# Layers shared by both scatter plots: points coloured by the k6 cluster
# assignment, the colour legend, and the title.
cluster_layers <- list(
  geom_point(stat = "identity", aes(color = as.factor(k6$cluster))),
  scale_color_discrete(
    name = " ",
    breaks = c("1", "2", "3", "4", "5", "6"),
    labels = c(
      "Cluster 1", "Cluster 2", "Cluster 3",
      "Cluster 4", "Cluster 5", "Cluster 6"
    )
  ),
  ggtitle("Segments of Mall Customers", subtitle = "Using K-means Clustering")
)

# Annual income against spending score.
ggplot(
  Customer_segment,
  aes(x = Annual.Income..k.., y = Spending.Score..1.100.)
) +
  cluster_layers

# Spending score against age.
ggplot(Customer_segment, aes(x = Spending.Score..1.100., y = Age)) +
  cluster_layers

# Map each element of `vec` to a rainbow colour, one colour per distinct
# value. Returns a character vector of colours, one per element of `vec`.
kCols <- function(vec) {
  hues <- rainbow(length(unique(vec)))
  group_index <- as.numeric(as.factor(vec))
  hues[group_index]
}

# Cluster assignment of every customer from the k6 fit, plus the same labels
# as character strings.
digCluster <- k6$cluster
dignm <- as.character(digCluster)

# Scatter plot of the first two principal-component scores, coloured by
# k-means cluster. The axes are labelled PC1 and PC2 because
# pcclust$x[, 1:2] holds the scores on those two components.
plot(
  pcclust$x[, 1:2],
  col = kCols(digCluster),
  pch = 19,
  xlab = "PC1",
  ylab = "PC2"
)

# Legend entries sorted by cluster label. kCols() is applied to the sorted
# distinct labels so every entry gets the same colour as its points.
legend_ids <- sort(unique(digCluster))
legend(
  "bottomleft",
  legend = as.character(legend_ids),
  fill = kCols(legend_ids)
)

Outcome (based on the cluster means printed above): 1. Cluster 1: Medium annual income and medium spend, older customers (mean age about 56).
2. Cluster 2: Low annual income and high spend.
3. Cluster 3: Low annual income and low spend.
4. Cluster 4: Medium annual income and medium spend, younger customers (mean age about 27).
5. Cluster 5: High annual income and low spend.
6. Cluster 6: High annual income and high spend.