# NK Project 3 Cluster Analysis
# Part I:
#   - determining the number of clusters
#   - experimenting with k-means vs. hierarchical clustering
# PREP #########################################################################
# Packages
pacman::p_load(
  pacman, rio, tidyverse, magrittr, janitor, # general stuff
  psych,       # EDA
  visdat,      # missingness
  data.table,  # working with data.tables
  corrplot,    # correlation plot
  FactoMineR,  # EDA, PCA, MFA
  factoextra,  # extract and visualize PCA/MFA
  nFactors,    # how many factors/components to retain
  cluster,     # clustering algorithms
  NbClust,     # number of clusters
  clValid      # cluster validation (stability & internal measures)
)
# Data
data <- import("data/Travel_Review.xlsx", range = "A1:Y5457",
               na = "0")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in L2714 / R2714C12: got '2 2.'
# clean variable names
data %<>% rename(SwimmingPools = `Swimming Pools`)
# drop missings
complete <- data[complete.cases(data),]
# standardize within each user (z-score each user's row of ratings)
data2 <- complete
data2[,-1] <- t(data2[,-1]) %>% scale() %>% t()
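# optional sanity check (not run here): after the row-wise scaling each user's
# ratings should have mean ~0 and sd 1 across the 24 categories
# summary(apply(data2[, -1], 1, mean))
# summary(apply(data2[, -1], 1, sd))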
# PCA
# X <- subset(data2, select = -c(UserID))
X <- subset(data2, select = -c(UserID, DanceClubs))
pca <- prcomp(X, center = TRUE, scale. = TRUE)
pca_label <- cbind(data2, as.data.frame(pca$x))
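# optional check of how much variance the retained components capture
# summary(pca)$importance[, 1:7]  # proportion / cumulative variance per PC
fviz_eig(pca, addlabels = TRUE)   # scree plot (factoextra)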
# EDA / VIZ ####################################################################
data3 <- pca_label %>%
  dplyr::select(PC1:PC7, DanceClubs) %>%
  scale() %>%
  as.data.frame()
str(data3)
## 'data.frame': 3724 obs. of 8 variables:
## $ PC1 : num -1.059 -0.674 -1.026 -1.154 -0.504 ...
## $ PC2 : num 0.732 1.332 0.539 0.467 0.736 ...
## $ PC3 : num -0.222 0.537 -0.17 -0.453 -0.602 ...
## $ PC4 : num -0.451 -0.172 -0.703 -0.396 0.151 ...
## $ PC5 : num -1.72 -1.7 -2.15 -1.49 -1.37 ...
## $ PC6 : num 0.766 1.284 0.621 0.244 1.314 ...
## $ PC7 : num -1.011 0.391 -0.849 -0.827 -0.641 ...
## $ DanceClubs: num 0.594 0.417 0.48 0.544 0.389 ...
psych::describe(data3)
## vars n mean sd median trimmed mad min max range skew
## PC1 1 3724 0 1 -0.18 -0.08 1.10 -1.52 2.55 4.06 0.57
## PC2 2 3724 0 1 0.16 0.05 1.00 -2.76 1.84 4.60 -0.44
## PC3 3 3724 0 1 0.13 0.03 1.03 -2.40 2.95 5.36 -0.24
## PC4 4 3724 0 1 -0.04 -0.02 1.18 -2.38 2.72 5.10 0.10
## PC5 5 3724 0 1 0.04 0.06 0.95 -4.57 2.41 6.98 -0.81
## PC6 6 3724 0 1 0.07 0.02 0.99 -3.14 3.09 6.23 -0.20
## PC7 7 3724 0 1 -0.03 -0.01 1.07 -3.09 4.09 7.18 0.17
## DanceClubs 8 3724 0 1 -0.28 -0.25 0.26 -1.29 5.55 6.84 3.08
## kurtosis se
## PC1 -0.80 0.02
## PC2 -0.58 0.02
## PC3 -0.66 0.02
## PC4 -0.81 0.02
## PC5 1.57 0.02
## PC6 0.02 0.02
## PC7 0.01 0.02
## DanceClubs 8.95 0.02
# visualize distance matrix
distance <- get_dist(data3)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
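# the full distance heatmap over ~3.7k users is heavy to render; a sampled
# version (sketch) is quicker to eyeball
fviz_dist(get_dist(slice_sample(data3, n = 300)),
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))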

# NUMBER OF CLUSTERS ###########################################################
# using NbClust()
# method options: kmeans, ward.D2, average, median, centroid, mcquitty (also single & complete but Dr D doesn't recommend)
sample_data <- slice_sample(data3, prop = 0.1)
nb <- NbClust(sample_data, distance = "manhattan",
              min.nc = 2, max.nc = 15, method = "centroid")
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 8 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
fviz_nbclust(nb)
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 8 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 8 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
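# sketch (not run, slow): NbClust's vote shifts with the agglomeration method
# and the sample drawn; tallying Best.nc across a few methods gives a rough
# consensus
# nb_methods <- c("kmeans", "ward.D2", "average", "centroid")
# votes <- sapply(nb_methods, function(m)
#   NbClust(sample_data, distance = "manhattan", min.nc = 2, max.nc = 15,
#           method = m)$Best.nc["Number_clusters", ])
# table(unlist(votes))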

# using clValid()
# method options: hierarchical, kmeans, diana, agnes, pam, clara (plus fanny, som, model, and sota)
clmethods <- c("hierarchical", "kmeans", "pam", "clara",
               "diana", "agnes")
sample_data <- slice_sample(data3, n = 600)
stab <- clValid(sample_data, nClust = 3:8,
                clMethods = clmethods, validation = "stability")
summary(stab)
##
## Clustering Methods:
## hierarchical kmeans pam clara diana agnes
##
## Cluster sizes:
## 3 4 5 6 7 8
##
## Validation Measures:
## 3 4 5 6 7 8
##
## hierarchical APN 0.0405 0.0923 0.1055 0.0865 0.1325 0.1547
## AD 3.6178 3.4932 3.4172 3.3203 3.2509 3.1052
## ADM 0.4073 0.3187 0.5414 0.5650 0.5790 0.6564
## FOM 0.9731 0.9583 0.9537 0.9482 0.9437 0.9323
## kmeans APN 0.0971 0.1341 0.2895 0.4013 0.4542 0.4657
## AD 3.4853 3.4534 3.3670 3.3440 3.2707 3.1580
## ADM 0.3121 0.3902 0.8961 1.1643 1.1899 1.3552
## FOM 0.9859 0.9775 0.9749 0.9704 0.9679 0.9621
## pam APN 0.4080 0.3413 0.3603 0.3750 0.4197 0.3359
## AD 3.6466 3.4751 3.2615 3.1381 3.0182 2.8041
## ADM 1.1799 1.1450 1.0972 1.1605 1.2113 0.9560
## FOM 0.9992 1.0000 0.9963 0.9786 0.9564 0.9389
## clara APN 0.4105 0.4794 0.4815 0.4823 0.4506 0.4487
## AD 3.6453 3.5891 3.4585 3.3059 3.1644 2.9896
## ADM 1.0760 1.3443 1.4103 1.4153 1.3707 1.2469
## FOM 0.9963 0.9881 0.9884 0.9779 0.9612 0.9392
## diana APN 0.1176 0.2316 0.3220 0.3342 0.3512 0.3871
## AD 3.4958 3.3959 3.3665 3.3084 3.2039 3.1053
## ADM 0.3542 0.8408 0.9305 0.9499 1.1676 1.2007
## FOM 0.9854 0.9695 0.9649 0.9613 0.9562 0.9505
## agnes APN 0.0405 0.0923 0.1055 0.0865 0.1325 0.1547
## AD 3.6178 3.4932 3.4172 3.3203 3.2509 3.1052
## ADM 0.4073 0.3187 0.5414 0.5650 0.5790 0.6564
## FOM 0.9731 0.9583 0.9537 0.9482 0.9437 0.9323
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0405 hierarchical 3
## AD 2.8041 pam 8
## ADM 0.3121 kmeans 3
## FOM 0.9323 hierarchical 8
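# sketch (not run): the same methods can also be scored on internal measures
# (connectivity, Dunn index, silhouette) as a cross-check on the stability results
# internal <- clValid(sample_data, nClust = 3:8,
#                     clMethods = clmethods, validation = "internal")
# summary(internal)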
# using fviz_nbclust()
# kmeans
sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, kmeans, k.max = 20, method = "wss") +
  labs(subtitle = "Elbow method") # 8 or 14?

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, kmeans, k.max = 20, method = "silhouette") +
  labs(subtitle = "Silhouette method") # 9

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, kmeans, k.max = 15, method = "gap_stat",
             nboot = 50, iter.max = 30) + # 9
  labs(subtitle = "Gap statistic method")

# pam, runs much slower than kmeans
sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, pam, k.max = 20, method = "wss") +
  labs(subtitle = "Elbow method") # 9?

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, pam, k.max = 20, method = "silhouette") +
  labs(subtitle = "Silhouette method") # 9, 11

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, pam, k.max = 15, method = "gap_stat",
             nboot = 50) + # 11?
  labs(subtitle = "Gap statistic method")

# using clusGap()
sample_data <- slice_sample(data3, prop = 0.1)
gap_stat <- clusGap(sample_data, FUN = kmeans, nstart = 25,
                    K.max = 20, B = 25, iter.max = 30)
fviz_gap_stat(gap_stat,
              maxSE = list(method = "globalSEmax", SE.factor = 1)) # gives k=18

print(gap_stat, method = "globalSEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = sample_data, FUNcluster = kmeans, K.max = 20, B = 25, nstart = 25, iter.max = 30)
## B=25 simulated reference sets, k = 1..20; spaceH0="scaledPCA"
## --> Number of clusters (method 'globalSEmax', SE.factor=1): 18
## logW E.logW gap SE.sim
## [1,] 5.870364 6.337838 0.4674733 0.008561558
## [2,] 5.813991 6.252201 0.4382103 0.009121331
## [3,] 5.736212 6.207101 0.4708888 0.008959112
## [4,] 5.661468 6.169857 0.5083884 0.009345998
## [5,] 5.596197 6.139139 0.5429424 0.009931472
## [6,] 5.529999 6.111324 0.5813248 0.009817264
## [7,] 5.465115 6.086674 0.6215591 0.009990850
## [8,] 5.421040 6.064374 0.6433344 0.009760220
## [9,] 5.368144 6.044257 0.6761137 0.010323699
## [10,] 5.332850 6.025158 0.6923074 0.010428350
## [11,] 5.297183 6.007438 0.7102550 0.009684115
## [12,] 5.266196 5.990505 0.7243098 0.009747336
## [13,] 5.239722 5.974481 0.7347595 0.009837895
## [14,] 5.212101 5.959934 0.7478328 0.009486505
## [15,] 5.179752 5.946055 0.7663034 0.010093245
## [16,] 5.160807 5.932719 0.7719127 0.010113628
## [17,] 5.140261 5.919091 0.7788293 0.010060472
## [18,] 5.110823 5.906116 0.7952931 0.010631857
## [19,] 5.094283 5.893793 0.7995097 0.010436817
## [20,] 5.078110 5.881818 0.8037080 0.010624455
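# sketch (not run): the chosen k is sensitive to the maxSE rule; more
# conservative rules (e.g. Tibs2001SEmax / firstSEmax) tend to pick a much
# smaller k than globalSEmax when the gap curve keeps creeping upward
# with(as.data.frame(gap_stat$Tab),
#      sapply(c("firstSEmax", "Tibs2001SEmax", "globalSEmax"),
#             function(m) maxSE(gap, SE.sim, method = m)))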
# HIERARCHICAL CLUSTERING ######################################################
# using hclust()
sample_data <- slice_sample(data3, prop = 0.25)
hclus1 <- hclust(get_dist(sample_data, stand = T), method = "ward.D2")
hclus1
##
## Call:
## hclust(d = get_dist(sample_data, stand = T), method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 931
# SLOOOOOOOOWWW
fviz_dend(hclus1, show_labels = F)

# suggests 6-10, depending on seed
grp7 <- cutree(hclus1, k = 7)
table(grp7)
## grp7
## 1 2 3 4 5 6 7
## 234 81 175 53 244 70 74
grp10 <- cutree(hclus1, k = 10)
table(grp10)
## grp10
## 1 2 3 4 5 6 7 8 9 10
## 78 81 106 53 81 70 163 156 74 69
grp12 <- cutree(hclus1, k = 12)
table(grp12)
## grp12
## 1 2 3 4 5 6 7 8 9 10 11 12
## 78 81 106 53 81 70 163 110 32 42 69 46
grp2 <- cutree(hclus1, k = 2)
table(grp2)
## grp2
## 1 2
## 857 74
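# another quick read (sketch) on where to cut: plot the largest merge heights;
# a big jump between successive heights suggests a natural cut point
rev(hclus1$height) %>% head(15) %>%
  plot(type = "b", xlab = "merges from the top", ylab = "merge height")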
# dendrogram colored by group
fviz_dend(hclus1, k = 6,
          cex = 0.5,
          palette = "Set3",
          show_labels = FALSE,
          rect = TRUE)

# visualize individuals on PC1 vs PC2, not very helpful
fviz_cluster(list(data = sample_data, cluster = grp2),
             axes = c(1, 2),
             palette = "Set3",
             ellipse = TRUE,
             ellipse.type = "convex", # convex hull around each cluster
             geom = "point",
             show.clust.cent = FALSE, ggtheme = theme_minimal())

# using agnes()
sample_data <- slice_sample(data3, prop = 0.1)
agnes1 <- agnes(sample_data, stand = T,
                metric = "euclidean", # "euclidean" or "manhattan"
                method = "ward"       # try "average" or "ward"
)
fviz_dend(agnes1, show_labels = F)

# collect agglomerative coefficients (agnes1$ac) across method/metric combos,
# e.g. one run per combination:
# agnes_ac <- expand.grid(method = c("ward", "average"),
#                         metric = c("euclidean", "manhattan"),
#                         stringsAsFactors = FALSE) %>%
#   mutate(ac = purrr::map2_dbl(method, metric, function(mth, mtr)
#     agnes(sample_data, stand = TRUE, metric = mtr, method = mth)$ac))
# write.csv(agnes_ac, "agnes_ac.csv", row.names = FALSE)
# using diana()
sample_data <- slice_sample(data3, prop = 0.1)
diana1 <- diana(sample_data, stand = T,
                metric = "euclidean" # "euclidean" or "manhattan"
)
fviz_dend(diana1, show_labels = F)

# collect divisive coefficients (diana1$dc) across metrics, e.g.:
# diana_dc <- data.frame(metric = c("euclidean", "manhattan")) %>%
#   mutate(dc = purrr::map_dbl(metric, function(mtr)
#     diana(sample_data, stand = TRUE, metric = mtr)$dc))
# write.csv(diana_dc, "diana_dc.csv", row.names = FALSE)
# K-MEANS CLUSTERING ###########################################################
km3 <- kmeans(data3, centers = 3, nstart = 25)
km4 <- kmeans(data3, centers = 4, nstart = 25)
km5 <- kmeans(data3, centers = 5, nstart = 25)
km6 <- kmeans(data3, centers = 6, nstart = 25)
km7 <- kmeans(data3, centers = 7, nstart = 25)
km8 <- kmeans(data3, centers = 8, nstart = 25)
km9 <- kmeans(data3, centers = 9, nstart = 25)
# not helpful
fviz_cluster(km9, data = data3, geom = "point")

fviz_silhouette(silhouette(km3$cluster, dist(data3))) # 0.15, no
## cluster size ave.sil.width
## 1 1 1515 0.14
## 2 2 295 0.27
## 3 3 1914 0.13

fviz_silhouette(silhouette(km4$cluster, dist(data3))) # 0.16, no
## cluster size ave.sil.width
## 1 1 917 0.06
## 2 2 1087 0.24
## 3 3 283 0.27
## 4 4 1437 0.14

fviz_silhouette(silhouette(km5$cluster, dist(data3))) # 0.18
## cluster size ave.sil.width
## 1 1 276 0.24
## 2 2 1105 0.22
## 3 3 917 0.17
## 4 4 764 0.23
## 5 5 662 0.07

fviz_silhouette(silhouette(km6$cluster, dist(data3))) # 0.21
## cluster size ave.sil.width
## 1 1 869 0.25
## 2 2 387 0.23
## 3 3 1101 0.17
## 4 4 201 0.27
## 5 5 905 0.16
## 6 6 261 0.26

fviz_silhouette(silhouette(km7$cluster, dist(data3))) # 0.22
## cluster size ave.sil.width
## 1 1 745 0.21
## 2 2 695 0.18
## 3 3 654 0.28
## 4 4 204 0.26
## 5 5 476 0.17
## 6 6 263 0.23
## 7 7 687 0.23

fviz_silhouette(silhouette(km8$cluster, dist(data3))) # 0.25
## cluster size ave.sil.width
## 1 1 587 0.26
## 2 2 263 0.23
## 3 3 343 0.49
## 4 4 618 0.18
## 5 5 198 0.25
## 6 6 820 0.17
## 7 7 582 0.27
## 8 8 313 0.27

fviz_silhouette(silhouette(km9$cluster, dist(data3))) # 0.26
## cluster size ave.sil.width
## 1 1 248 0.31
## 2 2 537 0.28
## 3 3 511 0.21
## 4 4 698 0.20
## 5 5 375 0.16
## 6 6 175 0.27
## 7 7 262 0.21
## 8 8 326 0.52
## 9 9 592 0.27
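# sketch (not run, recomputes a 3724 x 3724 distance matrix): collect the
# overall average silhouette width per k in one table
# d3 <- dist(data3)
# sil_by_k <- sapply(list(km3, km4, km5, km6, km7, km8, km9), function(fit)
#   mean(silhouette(fit$cluster, d3)[, "sil_width"]))
# data.frame(k = 3:9, avg_sil = round(sil_by_k, 2))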

km_ss <- data.frame(
  k = seq(3, 9, 1),
  tot.withinss = c(km3$tot.withinss, km4$tot.withinss, km5$tot.withinss,
                   km6$tot.withinss, km7$tot.withinss, km8$tot.withinss,
                   km9$tot.withinss),
  betweenss = c(km3$betweenss, km4$betweenss, km5$betweenss, km6$betweenss,
                km7$betweenss, km8$betweenss, km9$betweenss)
)
# write.csv(km_ss, "km_ss.csv", row.names = F)
km_ss %>%
  mutate(ratio = betweenss / tot.withinss) %>%
  ggplot() +
  geom_line(aes(k, betweenss), color = "blue") + geom_point(aes(k, betweenss)) +
  geom_line(aes(k, tot.withinss), color = "red") + geom_point(aes(k, tot.withinss))

# geom_line(aes(k, ratio)) + geom_point(aes(k, ratio))
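# the between/within ratio from the commented line above, plotted on its own
# scale so its shape is easier to see (sketch)
km_ss %>% mutate(ratio = betweenss / tot.withinss) %>%
  ggplot(aes(k, ratio)) + geom_line() + geom_point()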
# K-MEDOIDS CLUSTERING #########################################################
kmed9e <- pam(data3, 9, metric = "euclidean")
kmed9m <- pam(data3, 9, metric = "manhattan")
kmed8e <- pam(data3, 8, metric = "euclidean")
kmed8m <- pam(data3, 8, metric = "manhattan")
fviz_silhouette(silhouette(kmed9e, distance)) # 0.25
## cluster size ave.sil.width
## 1 1 545 0.29
## 2 2 664 0.19
## 3 3 454 0.11
## 4 4 568 0.26
## 5 5 496 0.21
## 6 6 326 0.52
## 7 7 235 0.24
## 8 8 178 0.27
## 9 9 258 0.28

fviz_silhouette(silhouette(kmed9m, distance)) # 0.23
## cluster size ave.sil.width
## 1 1 525 0.30
## 2 2 625 0.11
## 3 3 453 0.14
## 4 4 286 0.21
## 5 5 540 0.26
## 6 6 483 0.20
## 7 7 342 0.42
## 8 8 194 0.26
## 9 9 276 0.26

fviz_silhouette(silhouette(kmed8e, distance)) # 0.23
## cluster size ave.sil.width
## 1 1 592 0.23
## 2 2 937 0.12
## 3 3 535 0.29
## 4 4 513 0.22
## 5 5 334 0.50
## 6 6 361 0.20
## 7 7 246 0.25
## 8 8 206 0.24

fviz_silhouette(silhouette(kmed8m, distance)) # 0.23
## cluster size ave.sil.width
## 1 1 520 0.30
## 2 2 538 0.19
## 3 3 639 0.05
## 4 4 583 0.28
## 5 5 554 0.18
## 6 6 365 0.45
## 7 7 204 0.27
## 8 8 321 0.24

# FINAL (???) MODEL ############################################################
set.seed(7852)
km <- kmeans(data3, centers = 8, nstart = 25)
clus <- cbind(complete, cluster = km$cluster)
head(clus)
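# quick look at cluster sizes for the final k = 8 solution
table(clus$cluster)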
# EVALUATING THE MODEL
# comparing medians
clus_medians <- aggregate(x = clus[, 2:25],
                          by = list(clus$cluster),
                          FUN = median) %>%
  pivot_longer(cols = 2:25, values_to = "median") %>%
  rename(cluster = Group.1) %>%
  mutate(cluster = as_factor(cluster))
ggplot(clus_medians, aes(cluster, median, fill = cluster)) +
  ylim(0, 5) + facet_wrap(. ~ name) + theme_bw() +
  geom_col() +
  labs(title = "Comparing Clusters by Median Scores for k=8",
       subtitle = "DanceClubs kept out of the PCA, added back as its own variable")
# comparing means
clus_means <- aggregate(x = clus[, 2:25],
                        by = list(clus$cluster),
                        FUN = mean) %>%
  pivot_longer(cols = 2:25, values_to = "mean") %>%
  rename(cluster = Group.1) %>%
  mutate(cluster = as_factor(cluster))
ggplot(clus_means, aes(cluster, mean, fill = cluster)) +
  ylim(0, 5) + facet_wrap(. ~ name) + theme_bw() +
  geom_col()