Chapman, C., & Feit, E. M. (2019). R For Marketing Research and Analytics. Springer. https://link.springer.com/book/10.1007%2F978-3-030-14316-9
from 11.1 to 11.3.7 from page 299 to page 317
seg.raw <- read.csv("http://goo.gl/qw303p")
seg.df <- seg.raw[ , -7] # remove the known segment assignments
head(seg.df)
## age gender income kids ownHome subscribe
## 1 47.31613 Male 49482.81 2 ownNo subNo
## 2 31.38684 Male 35546.29 1 ownYes subNo
## 3 43.20034 Male 44169.19 0 ownYes subNo
## 4 37.31700 Female 81041.99 1 ownNo subNo
## 5 40.95439 Female 79353.01 3 ownYes subNo
## 6 43.03387 Male 58143.36 4 ownYes subNo
summary(seg.df)
## age gender income kids ownHome
## Min. :19.26 Female:157 Min. : -5183 Min. :0.00 ownNo :159
## 1st Qu.:33.01 Male :143 1st Qu.: 39656 1st Qu.:0.00 ownYes:141
## Median :39.49 Median : 52014 Median :1.00
## Mean :41.20 Mean : 50937 Mean :1.27
## 3rd Qu.:47.90 3rd Qu.: 61403 3rd Qu.:2.00
## Max. :80.49 Max. :114278 Max. :7.00
## subscribe
## subNo :260
## subYes: 40
##
##
##
##
seg.summ <- function(data , groups) {aggregate(data , list(groups), function(x) mean(as.numeric(x)))}
This function first splits the data by reported group
seg.summ(seg.df , seg.raw$Segment)
## Group.1 age gender income kids ownHome subscribe
## 1 Moving up 36.33114 1.30 53090.97 1.914286 1.328571 1.200
## 2 Suburb mix 39.92815 1.52 55033.82 1.920000 1.480000 1.060
## 3 Travelers 57.87088 1.50 62213.94 0.000000 1.750000 1.125
## 4 Urban hip 23.88459 1.60 21681.93 1.100000 1.200000 1.200
d <- dist(seg.df[, c("age", "income", "kids")])
as.matrix(d)[1:5, 1:5]
## 1 2 3 4 5
## 1 0.000 13936.531 5313.626 31559.178 29870.205
## 2 13936.531 0.000 8622.906 45495.698 43806.727
## 3 5313.626 8622.906 0.000 36872.800 35183.828
## 4 31559.178 45495.698 36872.800 0.000 1688.977
## 5 29870.205 43806.727 35183.828 1688.977 0.000
library(cluster) # daisy works with mixed data types
## Warning: package 'cluster' was built under R version 3.6.2
seg.dist <- daisy(seg.df)
as.matrix(seg.dist)[1:5, 1:5]
## 1 2 3 4 5
## 1 0.0000000 0.2532815 0.2329028 0.2617250 0.4161338
## 2 0.2532815 0.0000000 0.0679978 0.4129493 0.3014468
## 3 0.2329028 0.0679978 0.0000000 0.4246012 0.2932957
## 4 0.2617250 0.4129493 0.4246012 0.0000000 0.2265436
## 5 0.4161338 0.3014468 0.2932957 0.2265436 0.0000000
seg.hc <- hclust(seg.dist , method="complete")
plot(seg.hc)
plot(cut(as.dendrogram(seg.hc), h=0.5)$lower [[1]])
seg.df[c(101, 107), ] # similar
## age gender income kids ownHome subscribe
## 101 24.73796 Male 18457.85 1 ownNo subYes
## 107 23.19013 Male 17510.28 1 ownNo subYes
seg.df[c(278, 294), ] # similar
## age gender income kids ownHome subscribe
## 278 36.23860 Female 46540.88 1 ownNo subYes
## 294 35.79961 Female 52352.69 1 ownNo subYes
seg.df[c(298,256, 287), ] # similar
## age gender income kids ownHome subscribe
## 298 38.22980 Male 47580.93 0 ownNo subYes
## 256 39.33158 Male 52110.46 1 ownNo subYes
## 287 36.24118 Male 49775.29 1 ownNo subYes
seg.df[c(128,173, 141), ] # less similar
## age gender income kids ownHome subscribe
## 128 21.80737 Male 27807.24 2 ownNo subYes
## 173 64.70641 Male 45517.15 0 ownNo subYes
## 141 25.17703 Female 20125.80 2 ownNo subYes
One method is the cophenetic correlation coefficient (CPCC), which assesses how well a dendrogram (in this case seg.hc) matches the true distance metric (seg.dist)
CPCC is interpreted similarly to Pearson’s r . In this case, CPCC > 0.7 indicates. a relatively strong fit, meaning that the hierarchical tree represents the distances between customers well.
cor(cophenetic(seg.hc), seg.dist)
## [1] 0.7682436
#
plot(seg.hc)
rect.hclust(seg.hc , k=4, border="red")
seg.hc.segment <- cutree(seg.hc , k=4) # membership vector for 4 groups
table(seg.hc.segment)
## seg.hc.segment
## 1 2 3 4
## 124 136 18 22
seg.summ(seg.df , seg.hc.segment)
## Group.1 age gender income kids ownHome subscribe
## 1 1 40.78456 2.000000 49454.08 1.314516 1.467742 1
## 2 2 42.03492 1.000000 53759.62 1.235294 1.477941 1
## 3 3 44.31194 1.388889 52628.42 1.388889 2.000000 2
## 4 4 35.82935 1.545455 40456.14 1.136364 1.000000 2
plot(jitter(as.numeric(seg.df$gender)) ~
jitter(as.numeric(seg.df$subscribe)),col=seg.hc.segment , yaxt="n", xaxt="n", ylab="", xlab="")
axis(1, at=c(1, 2), labels=c("Subscribe: No", "Subscribe: Yes"))
axis(2, at=c(1, 2), labels=levels(seg.df$gender))
### 11.3.4 Mean-Based Clustering: kmeans()
seg.df.num <- seg.df
seg.df.num$gender <- ifelse(seg.df$gender =="Male", 0, 1)
seg.df.num$ownHome <- ifelse(seg.df$ownHome =="ownNo", 0, 1)
seg.df.num$subscribe <- ifelse(seg.df$subscribe =="subNo", 0, 1)
summary(seg.df.num)
## age gender income kids
## Min. :19.26 Min. :0.0000 Min. : -5183 Min. :0.00
## 1st Qu.:33.01 1st Qu.:0.0000 1st Qu.: 39656 1st Qu.:0.00
## Median :39.49 Median :1.0000 Median : 52014 Median :1.00
## Mean :41.20 Mean :0.5233 Mean : 50937 Mean :1.27
## 3rd Qu.:47.90 3rd Qu.:1.0000 3rd Qu.: 61403 3rd Qu.:2.00
## Max. :80.49 Max. :1.0000 Max. :114278 Max. :7.00
## ownHome subscribe
## Min. :0.00 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.0000
## Median :0.00 Median :0.0000
## Mean :0.47 Mean :0.1333
## 3rd Qu.:1.00 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000
set.seed (96743)
seg.k <- kmeans(seg.df.num , centers =4)
seg.summ(seg.df , seg.k$cluster)
## Group.1 age gender income kids ownHome subscribe
## 1 1 29.58704 1.571429 21631.79 1.063492 1.301587 1.158730
## 2 2 55.40968 1.400000 89959.96 0.360000 1.840000 1.120000
## 3 3 42.38909 1.469565 47799.84 1.434783 1.530435 1.165217
## 4 4 43.66931 1.443299 63630.70 1.443299 1.412371 1.082474
boxplot(seg.df.num$income ~ seg.k$cluster , ylab="Income", xlab="Cluster")
library(cluster)
clusplot(seg.df , seg.k$cluster , color=TRUE , shade=TRUE ,labels=4, lines=0, main="K-means cluster plot")
library(mclust)
## Warning: package 'mclust' was built under R version 3.6.2
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
seg.mc <- Mclust(seg.df.num)
summary(seg.mc)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -5137.106 300 73 -10690.59 -10690.59
##
## Clustering table:
## 1 2 3
## 163 71 66
seg.mc4 <- Mclust(seg.df.num , G=4)
summary(seg.mc4)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 4 components:
##
## log-likelihood n df BIC ICL
## -16862.69 300 31 -33902.19 -33906.18
##
## Clustering table:
## 1 2 3 4
## 104 66 59 71
BIC(seg.mc , seg.mc4)
## df BIC
## seg.mc 73 10690.59
## seg.mc4 31 33902.19
mclustBIC(seg.df.num)
## Bayesian Information Criterion (BIC):
## EII VII EEI VEI EVI VVI EEE
## 1 -37594.16 -37594.16 -11369.71 -11369.71 -11369.71 -11369.71 -11261.94
## 2 -36337.62 -36278.17 -11312.03 -11228.35 NA NA -11302.11
## 3 -35132.35 -34841.45 NA -10822.91 NA NA NA
## 4 -34321.30 -33902.19 NA NA NA NA NA
## 5 -33743.24 -33687.95 NA NA NA NA NA
## 6 -33351.92 -32949.37 NA NA NA NA NA
## 7 -33082.08 -32531.97 NA NA NA NA NA
## 8 -32609.96 -32158.18 NA NA NA NA NA
## 9 -32460.92 -31904.89 NA NA NA NA NA
## EVE VEE VVE EEV VEV EVV VVV
## 1 -11261.94 -11261.94 -11261.94 -11261.94 -11261.94 -11261.94 -11261.94
## 2 NA NA NA -11293.12 -11174.77 NA NA
## 3 NA NA NA NA -10690.59 NA NA
## 4 NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA
## 7 NA NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA
## 9 NA NA NA NA NA NA NA
##
## Top 3 models based on the BIC criterion:
## VEV,3 VEI,3 VEV,2
## -10690.59 -10822.91 -11174.77
seg.summ(seg.df , seg.mc$class)
## Group.1 age gender income kids ownHome subscribe
## 1 1 44.68018 1.472393 52980.52 1.171779 1.865031 1.245399
## 2 2 38.02229 1.000000 51550.98 1.422535 1.000000 1.000000
## 3 3 36.02187 2.000000 45227.51 1.348485 1.000000 1.000000
clusplot(seg.df , seg.mc$class , color=TRUE , shade=TRUE ,labels=4, lines=0, main="Model -based cluster plot")