# NK Project 3 Cluster Analysis
# Part I:
  # Determining the number of clusters
  # Experimenting with k-means vs. hierarchical clustering



# PREP #########################################################################
# Packages
pacman::p_load(
  pacman, rio, tidyverse, magrittr, janitor,  # general stuff
  psych,        # EDA
  visdat,       # missingness
  data.table,   # working with data.tables
  corrplot,     # correlation plot
  FactoMineR,   # EDA, PCA, MFA
  factoextra,   # extract and visualize PCA/MFA
  nFactors,     # how many factors/components to retain
  cluster,      # clustering algorithms
  NbClust,      # number of clusters
  clValid       # cluster validation measures (internal & stability)
)


# Data (ratings of 0 are read in as missing via na = "0")
data <- import("data/Travel_Review.xlsx", range = "A1:Y5457",
               na = "0")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in L2714 / R2714C12: got '2 2.'
# clean variable names (remove the space in `Swimming Pools`)
data %<>% rename(SwimmingPools = `Swimming Pools`)
# drop missings
complete <- data[complete.cases(data),]
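# quick check (sketch): how many rows does complete-case filtering drop,
# and where are the NAs concentrated? vis_miss() is from visdat, loaded above
nrow(data) - nrow(complete)   # rows dropped
vis_miss(data)                # visual map of missingness by variable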
# standardize by user (row-wise z-scores via double transpose)
data2 <- complete
data2[,-1] <- t(data2[,-1]) %>% scale() %>% t()
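# sanity check (sketch): after row-wise scaling, each user's ratings should
# have mean ~0 and sd ~1
summary(apply(data2[,-1], 1, mean))   # all ~0
summary(apply(data2[,-1], 1, sd))     # all ~1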

# PCA
# X <- subset(data2, select = -c(UserID))
X <- subset(data2, select = -c(UserID, DanceClubs))
pca <- prcomp(X, center = TRUE, scale. = TRUE)
pca_label <- cbind(data2, as.data.frame(pca$x))
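# how much variance do the leading components capture? (quick check; motivates
# keeping PC1:PC7 below)
summary(pca)$importance[, 1:8]
fviz_screeplot(pca, ncp = 10)
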
# EDA / VIZ ####################################################################

data3 <- pca_label %>% 
  dplyr::select(PC1:PC7, DanceClubs) %>% 
  scale() %>% as.data.frame()
str(data3)
## 'data.frame':    3724 obs. of  8 variables:
##  $ PC1       : num  -1.059 -0.674 -1.026 -1.154 -0.504 ...
##  $ PC2       : num  0.732 1.332 0.539 0.467 0.736 ...
##  $ PC3       : num  -0.222 0.537 -0.17 -0.453 -0.602 ...
##  $ PC4       : num  -0.451 -0.172 -0.703 -0.396 0.151 ...
##  $ PC5       : num  -1.72 -1.7 -2.15 -1.49 -1.37 ...
##  $ PC6       : num  0.766 1.284 0.621 0.244 1.314 ...
##  $ PC7       : num  -1.011 0.391 -0.849 -0.827 -0.641 ...
##  $ DanceClubs: num  0.594 0.417 0.48 0.544 0.389 ...
psych::describe(data3)
##            vars    n mean sd median trimmed  mad   min  max range  skew
## PC1           1 3724    0  1  -0.18   -0.08 1.10 -1.52 2.55  4.06  0.57
## PC2           2 3724    0  1   0.16    0.05 1.00 -2.76 1.84  4.60 -0.44
## PC3           3 3724    0  1   0.13    0.03 1.03 -2.40 2.95  5.36 -0.24
## PC4           4 3724    0  1  -0.04   -0.02 1.18 -2.38 2.72  5.10  0.10
## PC5           5 3724    0  1   0.04    0.06 0.95 -4.57 2.41  6.98 -0.81
## PC6           6 3724    0  1   0.07    0.02 0.99 -3.14 3.09  6.23 -0.20
## PC7           7 3724    0  1  -0.03   -0.01 1.07 -3.09 4.09  7.18  0.17
## DanceClubs    8 3724    0  1  -0.28   -0.25 0.26 -1.29 5.55  6.84  3.08
##            kurtosis   se
## PC1           -0.80 0.02
## PC2           -0.58 0.02
## PC3           -0.66 0.02
## PC4           -0.81 0.02
## PC5            1.57 0.02
## PC6            0.02 0.02
## PC7            0.01 0.02
## DanceClubs     8.95 0.02
# visualize distance matrix
distance <- get_dist(data3)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# NUMBER OF CLUSTERS ###########################################################

# using NbClust()
  # method options: kmeans, ward.D2, average, median, centroid, mcquitty
  # (single and complete are also available, but Dr D doesn't recommend them)
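  # note: each slice_sample() below draws a fresh random subsample, so the
  # index votes shift from run to run; uncomment a fixed seed for
  # reproducible results
# set.seed(1234)   # arbitrary seed (sketch); any fixed value works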
sample_data <- slice_sample(data3, prop = 0.1)
nb <- NbClust(sample_data, distance = "manhattan",
              min.nc = 2, max.nc = 15, method = "centroid")
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 8 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 11 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 8 proposed  2 as the best number of clusters
## * 2 proposed  3 as the best number of clusters
## * 3 proposed  4 as the best number of clusters
## * 8 proposed  5 as the best number of clusters
## * 1 proposed  6 as the best number of clusters
## * 1 proposed  11 as the best number of clusters
## * 1 proposed  15 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

# using clValid()
  # method options: hierarchical, kmeans, diana, agnes, pam, clara
  # (clValid also supports fanny, som, model, and sota)
clmethods <- c("hierarchical", "kmeans", "pam", "clara",
               "diana", "agnes")
sample_data <- slice_sample(data3, n = 600)
stab <- clValid(sample_data, nClust = 3:8,
                clMethods = clmethods, validation = "stability")
summary(stab)
## 
## Clustering Methods:
##  hierarchical kmeans pam clara diana agnes 
## 
## Cluster sizes:
##  3 4 5 6 7 8 
## 
## Validation Measures:
##                        3      4      5      6      7      8
##                                                            
## hierarchical APN  0.0405 0.0923 0.1055 0.0865 0.1325 0.1547
##              AD   3.6178 3.4932 3.4172 3.3203 3.2509 3.1052
##              ADM  0.4073 0.3187 0.5414 0.5650 0.5790 0.6564
##              FOM  0.9731 0.9583 0.9537 0.9482 0.9437 0.9323
## kmeans       APN  0.0971 0.1341 0.2895 0.4013 0.4542 0.4657
##              AD   3.4853 3.4534 3.3670 3.3440 3.2707 3.1580
##              ADM  0.3121 0.3902 0.8961 1.1643 1.1899 1.3552
##              FOM  0.9859 0.9775 0.9749 0.9704 0.9679 0.9621
## pam          APN  0.4080 0.3413 0.3603 0.3750 0.4197 0.3359
##              AD   3.6466 3.4751 3.2615 3.1381 3.0182 2.8041
##              ADM  1.1799 1.1450 1.0972 1.1605 1.2113 0.9560
##              FOM  0.9992 1.0000 0.9963 0.9786 0.9564 0.9389
## clara        APN  0.4105 0.4794 0.4815 0.4823 0.4506 0.4487
##              AD   3.6453 3.5891 3.4585 3.3059 3.1644 2.9896
##              ADM  1.0760 1.3443 1.4103 1.4153 1.3707 1.2469
##              FOM  0.9963 0.9881 0.9884 0.9779 0.9612 0.9392
## diana        APN  0.1176 0.2316 0.3220 0.3342 0.3512 0.3871
##              AD   3.4958 3.3959 3.3665 3.3084 3.2039 3.1053
##              ADM  0.3542 0.8408 0.9305 0.9499 1.1676 1.2007
##              FOM  0.9854 0.9695 0.9649 0.9613 0.9562 0.9505
## agnes        APN  0.0405 0.0923 0.1055 0.0865 0.1325 0.1547
##              AD   3.6178 3.4932 3.4172 3.3203 3.2509 3.1052
##              ADM  0.4073 0.3187 0.5414 0.5650 0.5790 0.6564
##              FOM  0.9731 0.9583 0.9537 0.9482 0.9437 0.9323
## 
## Optimal Scores:
## 
##     Score  Method       Clusters
## APN 0.0405 hierarchical 3       
## AD  2.8041 pam          8       
## ADM 0.3121 kmeans       3       
## FOM 0.9323 hierarchical 8
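# the optimal-score table can also be extracted programmatically (clValid
# provides optimalScores() for this)
optimalScores(stab)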
# using fviz_nbclust()
  # kmeans
sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, kmeans, k.max = 20, method = "wss") +
  labs(subtitle = "Elbow method")   # 8 or 14?

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, kmeans, k.max = 20, method = "silhouette") +
  labs(subtitle = "Silhouette method")   # 9

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, kmeans, k.max = 15, method = "gap_stat", 
             nboot = 50, iter.max = 30) +   # 9
  labs(subtitle = "Gap statistic method")

  # pam, runs much slower than kmeans
sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, pam, k.max = 20, method = "wss") +
  labs(subtitle = "Elbow method")   # 9?

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, pam, k.max = 20, method = "silhouette") +
  labs(subtitle = "Silhouette method")   # 9, 11

sample_data <- slice_sample(data3, prop = 0.1)
fviz_nbclust(sample_data, pam, k.max = 15, method = "gap_stat", 
             nboot = 50) +   # 11?
  labs(subtitle = "Gap statistic method")

# using clusGap()
sample_data <- slice_sample(data3, prop = 0.1)
gap_stat <- clusGap(sample_data, FUN = kmeans, nstart = 25,
                    K.max = 20, B = 25, iter.max = 30)

fviz_gap_stat(gap_stat, 
              maxSE = list(method = "globalSEmax", SE.factor = 1))   # gives k=18

print(gap_stat, method = "globalSEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = sample_data, FUNcluster = kmeans, K.max = 20, B = 25,     nstart = 25, iter.max = 30)
## B=25 simulated reference sets, k = 1..20; spaceH0="scaledPCA"
##  --> Number of clusters (method 'globalSEmax', SE.factor=1): 18
##           logW   E.logW       gap      SE.sim
##  [1,] 5.870364 6.337838 0.4674733 0.008561558
##  [2,] 5.813991 6.252201 0.4382103 0.009121331
##  [3,] 5.736212 6.207101 0.4708888 0.008959112
##  [4,] 5.661468 6.169857 0.5083884 0.009345998
##  [5,] 5.596197 6.139139 0.5429424 0.009931472
##  [6,] 5.529999 6.111324 0.5813248 0.009817264
##  [7,] 5.465115 6.086674 0.6215591 0.009990850
##  [8,] 5.421040 6.064374 0.6433344 0.009760220
##  [9,] 5.368144 6.044257 0.6761137 0.010323699
## [10,] 5.332850 6.025158 0.6923074 0.010428350
## [11,] 5.297183 6.007438 0.7102550 0.009684115
## [12,] 5.266196 5.990505 0.7243098 0.009747336
## [13,] 5.239722 5.974481 0.7347595 0.009837895
## [14,] 5.212101 5.959934 0.7478328 0.009486505
## [15,] 5.179752 5.946055 0.7663034 0.010093245
## [16,] 5.160807 5.932719 0.7719127 0.010113628
## [17,] 5.140261 5.919091 0.7788293 0.010060472
## [18,] 5.110823 5.906116 0.7952931 0.010631857
## [19,] 5.094283 5.893793 0.7995097 0.010436817
## [20,] 5.078110 5.881818 0.8037080 0.010624455
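# compare gap-statistic decision rules (sketch): cluster::maxSE() implements
# several; Tibshirani's original rule is far more conservative than globalSEmax
tab <- as.data.frame(gap_stat$Tab)
maxSE(tab$gap, tab$SE.sim, method = "Tibs2001SEmax")   # smallest k with gap[k] >= gap[k+1] - SE[k+1]
maxSE(tab$gap, tab$SE.sim, method = "globalSEmax")     # 18, matching the printout above
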
# HIERARCHICAL CLUSTERING ######################################################

# using hclust()
sample_data <- slice_sample(data3, prop = 0.25)
hclus1 <- hclust(get_dist(sample_data, stand = T), method = "ward.D2")
hclus1
## 
## Call:
## hclust(d = get_dist(sample_data, stand = T), method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 931
# slow: rendering a dendrogram with ~930 leaves takes a while
fviz_dend(hclus1, show_labels = F)

  # suggests 6-10, depending on seed

grp7 <- cutree(hclus1, k = 7)
table(grp7)
## grp7
##   1   2   3   4   5   6   7 
## 234  81 175  53 244  70  74
grp10 <- cutree(hclus1, k = 10)
table(grp10)
## grp10
##   1   2   3   4   5   6   7   8   9  10 
##  78  81 106  53  81  70 163 156  74  69
grp12 <- cutree(hclus1, k = 12)
table(grp12)
## grp12
##   1   2   3   4   5   6   7   8   9  10  11  12 
##  78  81 106  53  81  70 163 110  32  42  69  46
grp2 <- cutree(hclus1, k = 2)
table(grp2)
## grp2
##   1   2 
## 857  74
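# silhouette check for a hierarchical cut (sketch), reusing the standardized
# distance the tree was built on
d_sub <- get_dist(sample_data, stand = T)
fviz_silhouette(silhouette(grp7, d_sub))
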
# dendrogram colored by group
fviz_dend(hclus1, k = 6,
          cex = 0.5,
          palette = "Set3",
          show_labels = FALSE,
          rect = TRUE
)

# visualize individuals on PC1 vs PC2, not very helpful
fviz_cluster(list(data = sample_data, cluster = grp2),
             axes = c(1,2),
             palette = "Set3",
             ellipse = T,
             ellipse.type = "convex", # Concentration ellipse
             geom = "point",
             show.clust.cent = FALSE, ggtheme = theme_minimal())

# using agnes()
sample_data <- slice_sample(data3, prop = 0.1)
agnes1 <- agnes(sample_data, stand = T,
                metric = "euclidean", # "euclidean" or "manhattan"
                method = "ward"       # try "average" or "ward"
                )
fviz_dend(agnes1, show_labels = F)
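# agglomerative coefficient: values near 1 suggest strong clustering structure
agnes1$ac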

# scratch: log agglomerative coefficients (agnes1$ac) across repeated runs
# agnes_ac <- data.frame(ac = 0.975)
# agnes_ac <- rbind(agnes_ac, agnes1$ac)

# agnes_ac <- cbind(agnes_ac, 
#                   method = c(rep("ward", 10), rep("average", 10)),
#                   metric = rep(c(rep("euclidean", 5), rep("manhattan", 5)), 2)
# )
# write.csv(agnes_ac, "agnes_ac.csv", row.names = F)




# using diana()
sample_data <- slice_sample(data3, prop = 0.1)
diana1 <- diana(sample_data, stand = T,
                metric = "euclidean"  # "euclidean" or "manhattan"
)
fviz_dend(diana1, show_labels = F)
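# divisive coefficient, diana's analogue of the agnes ac
diana1$dc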

# scratch: log divisive coefficients (diana1$dc) across repeated runs
# diana_dc <- data.frame(dc = diana1$dc)
# diana_dc <- rbind(diana_dc, diana1$dc)

# diana_dc <- cbind(diana_dc,
#                   metric = c(rep("euclidean", 5), rep("manhattan", 5))
# )
# write.csv(diana_dc, "diana_dc.csv", row.names = F)
# K-MEANS CLUSTERING ###########################################################

km3 <- kmeans(data3, centers = 3, nstart = 25)
km4 <- kmeans(data3, centers = 4, nstart = 25)
km5 <- kmeans(data3, centers = 5, nstart = 25)
km6 <- kmeans(data3, centers = 6, nstart = 25)
km7 <- kmeans(data3, centers = 7, nstart = 25)
km8 <- kmeans(data3, centers = 8, nstart = 25)
km9 <- kmeans(data3, centers = 9, nstart = 25)
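
# same sweep, more compactly (sketch using purrr::map, loaded via tidyverse);
# equivalent to the km3..km9 objects above
km_list <- map(3:9, ~ kmeans(data3, centers = .x, nstart = 25))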


# not helpful: 9 clusters overlap heavily in the default 2-D projection
fviz_cluster(km9, data = data3, geom = "point")

fviz_silhouette(silhouette(km3$cluster, dist(data3)))   # 0.15, no
##   cluster size ave.sil.width
## 1       1 1515          0.14
## 2       2  295          0.27
## 3       3 1914          0.13

fviz_silhouette(silhouette(km4$cluster, dist(data3)))   # 0.16, no
##   cluster size ave.sil.width
## 1       1  917          0.06
## 2       2 1087          0.24
## 3       3  283          0.27
## 4       4 1437          0.14

fviz_silhouette(silhouette(km5$cluster, dist(data3)))   # 0.18
##   cluster size ave.sil.width
## 1       1  276          0.24
## 2       2 1105          0.22
## 3       3  917          0.17
## 4       4  764          0.23
## 5       5  662          0.07

fviz_silhouette(silhouette(km6$cluster, dist(data3)))   # 0.21
##   cluster size ave.sil.width
## 1       1  869          0.25
## 2       2  387          0.23
## 3       3 1101          0.17
## 4       4  201          0.27
## 5       5  905          0.16
## 6       6  261          0.26

fviz_silhouette(silhouette(km7$cluster, dist(data3)))   # 0.22
##   cluster size ave.sil.width
## 1       1  745          0.21
## 2       2  695          0.18
## 3       3  654          0.28
## 4       4  204          0.26
## 5       5  476          0.17
## 6       6  263          0.23
## 7       7  687          0.23

fviz_silhouette(silhouette(km8$cluster, dist(data3)))   # 0.25
##   cluster size ave.sil.width
## 1       1  587          0.26
## 2       2  263          0.23
## 3       3  343          0.49
## 4       4  618          0.18
## 5       5  198          0.25
## 6       6  820          0.17
## 7       7  582          0.27
## 8       8  313          0.27

fviz_silhouette(silhouette(km9$cluster, dist(data3)))   # 0.26
##   cluster size ave.sil.width
## 1       1  248          0.31
## 2       2  537          0.28
## 3       3  511          0.21
## 4       4  698          0.20
## 5       5  375          0.16
## 6       6  175          0.27
## 7       7  262          0.21
## 8       8  326          0.52
## 9       9  592          0.27

km_ss <- data.frame(
  k = seq(3, 9, 1),
  tot.withinss = c(
    km3$tot.withinss,
    km4$tot.withinss,
    km5$tot.withinss,
    km6$tot.withinss,
    km7$tot.withinss,
    km8$tot.withinss,
    km9$tot.withinss
  ),
  betweenss = c(
    km3$betweenss,
    km4$betweenss,
    km5$betweenss,
    km6$betweenss,
    km7$betweenss,
    km8$betweenss,
    km9$betweenss
  )
)

# write.csv(km_ss, "km_ss.csv", row.names = F)


km_ss %>% mutate(ratio = betweenss/tot.withinss) %>% 
  ggplot() + 
  geom_line(aes(k, betweenss), color = "blue") + geom_point(aes(k, betweenss)) +
  geom_line(aes(k, tot.withinss), color = "red") + geom_point(aes(k, tot.withinss))

  # alternative: plot the between/within ratio instead
  # geom_line(aes(k, ratio)) + geom_point(aes(k, ratio))
# K-MEDOIDS CLUSTERING #########################################################

kmed9e <- pam(data3, 9, metric = "euclidean")
kmed9m <- pam(data3, 9, metric = "manhattan")
kmed8e <- pam(data3, 8, metric = "euclidean")
kmed8m <- pam(data3, 8, metric = "manhattan")
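# note: pam objects carry their own silhouette info (x$silinfo), and
# silhouette() on a partition object uses that, so the precomputed `distance`
# passed below is effectively ignored; each fit's own metric applies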



fviz_silhouette(silhouette(kmed9e, distance))   # 0.25
##   cluster size ave.sil.width
## 1       1  545          0.29
## 2       2  664          0.19
## 3       3  454          0.11
## 4       4  568          0.26
## 5       5  496          0.21
## 6       6  326          0.52
## 7       7  235          0.24
## 8       8  178          0.27
## 9       9  258          0.28

fviz_silhouette(silhouette(kmed9m, distance))   # 0.23
##   cluster size ave.sil.width
## 1       1  525          0.30
## 2       2  625          0.11
## 3       3  453          0.14
## 4       4  286          0.21
## 5       5  540          0.26
## 6       6  483          0.20
## 7       7  342          0.42
## 8       8  194          0.26
## 9       9  276          0.26

fviz_silhouette(silhouette(kmed8e, distance))   # 0.23
##   cluster size ave.sil.width
## 1       1  592          0.23
## 2       2  937          0.12
## 3       3  535          0.29
## 4       4  513          0.22
## 5       5  334          0.50
## 6       6  361          0.20
## 7       7  246          0.25
## 8       8  206          0.24

fviz_silhouette(silhouette(kmed8m, distance))   # 0.23
##   cluster size ave.sil.width
## 1       1  520          0.30
## 2       2  538          0.19
## 3       3  639          0.05
## 4       4  583          0.28
## 5       5  554          0.18
## 6       6  365          0.45
## 7       7  204          0.27
## 8       8  321          0.24

# FINAL MODEL (TENTATIVE) ######################################################

set.seed(7852)
km <- kmeans(data3, centers = 8, nstart = 25)

clus <- cbind(complete, cluster = km$cluster)
head(clus)
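# cluster sizes for the final model
table(km$cluster)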




# EVALUATING THE MODEL #########################################################

# comparing medians
clus_medians <- aggregate(x = clus[, 2:25],
                          by = list(clus$cluster),
                          FUN = median) %>% 
  pivot_longer(cols = 2:25, values_to = "median") %>% 
  rename(cluster = Group.1) %>% 
  mutate(cluster = as_factor(cluster))


ggplot(clus_medians, aes(cluster, median, fill = cluster)) + 
  ylim(0,5) + facet_wrap(.~name) + theme_bw() +
  geom_col() + 
  labs(title = "Comparing Clusters by Median Scores for k=8",
       subtitle = "DanceClubs included in PCA")



# comparing means
clus_means <- aggregate(x = clus[, 2:25],
                        by = list(clus$cluster),
                        FUN = mean) %>% 
  pivot_longer(cols = 2:25, values_to = "mean") %>% 
  rename(cluster = Group.1) %>% 
  mutate(cluster = as_factor(cluster))


ggplot(clus_means, aes(cluster, mean, fill = cluster)) + 
  ylim(0,5) + facet_wrap(.~name) + theme_bw() +
  geom_col()