chapter11 Segmentation: Clustering and Classification from R For Marketing Research and Analytics 2019

11.3 Clustering

11.3.1 The Steps of Clustering

seg.summ <- function(data , groups) {aggregate(data , list(groups), function(x) mean(as.numeric(x)))}

This function first splits the data by reported group

seg.summ(seg.df , seg.raw$Segment)

##      Group.1      age gender   income     kids  ownHome subscribe
## 1  Moving up 36.33114   1.30 53090.97 1.914286 1.328571     1.200
## 2 Suburb mix 39.92815   1.52 55033.82 1.920000 1.480000     1.060
## 3  Travelers 57.87088   1.50 62213.94 0.000000 1.750000     1.125
## 4  Urban hip 23.88459   1.60 21681.93 1.100000 1.200000     1.200

11.3.2 Hierarchical Clustering: hclust() Basics

d <- dist(seg.df[, c("age", "income", "kids")])
as.matrix(d)[1:5, 1:5]

##           1         2         3         4         5
## 1     0.000 13936.531  5313.626 31559.178 29870.205
## 2 13936.531     0.000  8622.906 45495.698 43806.727
## 3  5313.626  8622.906     0.000 36872.800 35183.828
## 4 31559.178 45495.698 36872.800     0.000  1688.977
## 5 29870.205 43806.727 35183.828  1688.977     0.000

library(cluster) # daisy works with mixed data types

## Warning: package 'cluster' was built under R version 3.6.2

seg.dist <- daisy(seg.df)

as.matrix(seg.dist)[1:5, 1:5]

##           1         2         3         4         5
## 1 0.0000000 0.2532815 0.2329028 0.2617250 0.4161338
## 2 0.2532815 0.0000000 0.0679978 0.4129493 0.3014468
## 3 0.2329028 0.0679978 0.0000000 0.4246012 0.2932957
## 4 0.2617250 0.4129493 0.4246012 0.0000000 0.2265436
## 5 0.4161338 0.3014468 0.2932957 0.2265436 0.0000000

seg.hc <- hclust(seg.dist , method="complete")

plot(seg.hc)

plot(cut(as.dendrogram(seg.hc), h=0.5)$lower [[1]])

seg.df[c(101, 107), ] # similar

##          age gender   income kids ownHome subscribe
## 101 24.73796   Male 18457.85    1   ownNo    subYes
## 107 23.19013   Male 17510.28    1   ownNo    subYes

seg.df[c(278, 294), ] # similar

##          age gender   income kids ownHome subscribe
## 278 36.23860 Female 46540.88    1   ownNo    subYes
## 294 35.79961 Female 52352.69    1   ownNo    subYes

seg.df[c(298,256, 287), ] # similar

##          age gender   income kids ownHome subscribe
## 298 38.22980   Male 47580.93    0   ownNo    subYes
## 256 39.33158   Male 52110.46    1   ownNo    subYes
## 287 36.24118   Male 49775.29    1   ownNo    subYes

seg.df[c(128,173, 141), ] # less similar

##          age gender   income kids ownHome subscribe
## 128 21.80737   Male 27807.24    2   ownNo    subYes
## 173 64.70641   Male 45517.15    0   ownNo    subYes
## 141 25.17703 Female 20125.80    2   ownNo    subYes

One method is the cophenetic correlation coefficient (CPCC), which assesses how well a dendrogram (in this case seg.hc) matches the true distance metric (seg.dist)

CPCC is interpreted similarly to Pearson’s r . In this case, CPCC > 0.7 indicates. a relatively strong fit, meaning that the hierarchical tree represents the distances between customers well.

cor(cophenetic(seg.hc), seg.dist)

## [1] 0.7682436

11.3.3 Hierarchical Clustering Continued: Groups from hclust()

plot(seg.hc)
rect.hclust(seg.hc , k=4, border="red")

seg.hc.segment <- cutree(seg.hc , k=4) # membership vector for 4 groups
table(seg.hc.segment)

## seg.hc.segment
##   1   2   3   4 
## 124 136  18  22

seg.summ(seg.df , seg.hc.segment)

##   Group.1      age   gender   income     kids  ownHome subscribe
## 1       1 40.78456 2.000000 49454.08 1.314516 1.467742         1
## 2       2 42.03492 1.000000 53759.62 1.235294 1.477941         1
## 3       3 44.31194 1.388889 52628.42 1.388889 2.000000         2
## 4       4 35.82935 1.545455 40456.14 1.136364 1.000000         2

plot(jitter(as.numeric(seg.df$gender)) ~
       jitter(as.numeric(seg.df$subscribe)),col=seg.hc.segment , yaxt="n", xaxt="n", ylab="", xlab="")
axis(1, at=c(1, 2), labels=c("Subscribe: No", "Subscribe: Yes"))
axis(2, at=c(1, 2), labels=levels(seg.df$gender))

### 11.3.4 Mean-Based Clustering: kmeans()

seg.df.num <- seg.df
seg.df.num$gender <- ifelse(seg.df$gender =="Male", 0, 1)
seg.df.num$ownHome <- ifelse(seg.df$ownHome =="ownNo", 0, 1)
seg.df.num$subscribe <- ifelse(seg.df$subscribe =="subNo", 0, 1)
summary(seg.df.num)

##       age            gender           income            kids     
##  Min.   :19.26   Min.   :0.0000   Min.   : -5183   Min.   :0.00  
##  1st Qu.:33.01   1st Qu.:0.0000   1st Qu.: 39656   1st Qu.:0.00  
##  Median :39.49   Median :1.0000   Median : 52014   Median :1.00  
##  Mean   :41.20   Mean   :0.5233   Mean   : 50937   Mean   :1.27  
##  3rd Qu.:47.90   3rd Qu.:1.0000   3rd Qu.: 61403   3rd Qu.:2.00  
##  Max.   :80.49   Max.   :1.0000   Max.   :114278   Max.   :7.00  
##     ownHome       subscribe     
##  Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.00   1st Qu.:0.0000  
##  Median :0.00   Median :0.0000  
##  Mean   :0.47   Mean   :0.1333  
##  3rd Qu.:1.00   3rd Qu.:0.0000  
##  Max.   :1.00   Max.   :1.0000

set.seed (96743)
seg.k <- kmeans(seg.df.num , centers =4)

seg.summ(seg.df , seg.k$cluster)

##   Group.1      age   gender   income     kids  ownHome subscribe
## 1       1 29.58704 1.571429 21631.79 1.063492 1.301587  1.158730
## 2       2 55.40968 1.400000 89959.96 0.360000 1.840000  1.120000
## 3       3 42.38909 1.469565 47799.84 1.434783 1.530435  1.165217
## 4       4 43.66931 1.443299 63630.70 1.443299 1.412371  1.082474

boxplot(seg.df.num$income ~ seg.k$cluster , ylab="Income", xlab="Cluster")

library(cluster)
clusplot(seg.df , seg.k$cluster , color=TRUE , shade=TRUE ,labels=4, lines=0, main="K-means cluster plot")

11.3.5 Model-Based Clustering: Mclust()

library(mclust)

## Warning: package 'mclust' was built under R version 3.6.2

## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.

seg.mc <- Mclust(seg.df.num)
summary(seg.mc)

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEV (ellipsoidal, equal shape) model with 3 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -5137.106 300 73 -10690.59 -10690.59
## 
## Clustering table:
##   1   2   3 
## 163  71  66

seg.mc4 <- Mclust(seg.df.num , G=4)
summary(seg.mc4)

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VII (spherical, varying volume) model with 4 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -16862.69 300 31 -33902.19 -33906.18
## 
## Clustering table:
##   1   2   3   4 
## 104  66  59  71

11.3.6 Comparing Models with BIC()

BIC(seg.mc , seg.mc4)

##         df      BIC
## seg.mc  73 10690.59
## seg.mc4 31 33902.19

mclustBIC(seg.df.num)

## Bayesian Information Criterion (BIC): 
##         EII       VII       EEI       VEI       EVI       VVI       EEE
## 1 -37594.16 -37594.16 -11369.71 -11369.71 -11369.71 -11369.71 -11261.94
## 2 -36337.62 -36278.17 -11312.03 -11228.35        NA        NA -11302.11
## 3 -35132.35 -34841.45        NA -10822.91        NA        NA        NA
## 4 -34321.30 -33902.19        NA        NA        NA        NA        NA
## 5 -33743.24 -33687.95        NA        NA        NA        NA        NA
## 6 -33351.92 -32949.37        NA        NA        NA        NA        NA
## 7 -33082.08 -32531.97        NA        NA        NA        NA        NA
## 8 -32609.96 -32158.18        NA        NA        NA        NA        NA
## 9 -32460.92 -31904.89        NA        NA        NA        NA        NA
##         EVE       VEE       VVE       EEV       VEV       EVV       VVV
## 1 -11261.94 -11261.94 -11261.94 -11261.94 -11261.94 -11261.94 -11261.94
## 2        NA        NA        NA -11293.12 -11174.77        NA        NA
## 3        NA        NA        NA        NA -10690.59        NA        NA
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA        NA        NA        NA        NA
## 6        NA        NA        NA        NA        NA        NA        NA
## 7        NA        NA        NA        NA        NA        NA        NA
## 8        NA        NA        NA        NA        NA        NA        NA
## 9        NA        NA        NA        NA        NA        NA        NA
## 
## Top 3 models based on the BIC criterion: 
##     VEV,3     VEI,3     VEV,2 
## -10690.59 -10822.91 -11174.77

seg.summ(seg.df , seg.mc$class)

##   Group.1      age   gender   income     kids  ownHome subscribe
## 1       1 44.68018 1.472393 52980.52 1.171779 1.865031  1.245399
## 2       2 38.02229 1.000000 51550.98 1.422535 1.000000  1.000000
## 3       3 36.02187 2.000000 45227.51 1.348485 1.000000  1.000000

clusplot(seg.df , seg.mc$class , color=TRUE , shade=TRUE ,labels=4, lines=0, main="Model -based cluster plot")

chapter11 Segmentation: Clustering and Classification from R For Marketing Research and Analytics 2019

Kai Wu

2020/2/24