# NK Project 3 Cluster Analysis
# Part II - 
  # Evaluating clusters



# PREP #########################################################################
# Packages
pacman::p_load(
  pacman, rio, tidyverse, magrittr, janitor,  # general stuff
  psych,        # EDA
  visdat,       # missingness
  data.table,   # working with data.tables
  corrplot,     # correlation plot
  FactoMineR,   # EDA, PCA, MFA
  factoextra,   # extract and visualize PCA/MFA
  nFactors,     # how many factors/components to retain
  cluster,      # clustering algorithms
  NbClust,      # number of clusters
  clValid       # ?
)


# Data
data <- import("data/Travel_Review.xlsx", range = "A1:Y5457",
               na = "0")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in L2714 / R2714C12: got '2 2.'
# clean variable names
data %<>% rename(SwimmingPools = `Swimming Pools`)
# drop missings
complete <- data[complete.cases(data),]
# standardize by user
data2 <- complete
data2[,-1] <- t(data2[,-1]) %>% scale() %>% t()

# PCA WITHOUT DANCE CLUB
# X <- subset(data2, select = -c(UserID, DanceClubs))
# pca <- prcomp(X, center = TRUE, scale. = TRUE)
# pca_label <- cbind(data2, as.data.frame(pca$x))
# 
# data3 <- pca_label %>% 
#   dplyr::select(PC1:PC7, DanceClubs) %>%
#   scale() %>% as.data.frame()


# PCA WITH DANCE CLUB
# X <- subset(data2, select = -c(UserID))
# pca <- prcomp(X, center = TRUE, scale. = TRUE)
# pca_label <- cbind(data2, as.data.frame(pca$x))
# 
# data3 <- pca_label %>% 
#   dplyr::select(PC1:PC7) %>%
#   scale() %>% as.data.frame()


# PCA - playing with excluding vars
X <- subset(data2, select = -c(UserID
                               # , DanceClubs
                               , ArtGalleries
                               , LocalServices
                               # , BeautySpas
                               ))
pca <- prcomp(X, center = TRUE, scale. = TRUE)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.2072 1.6229 1.3167 1.21797 1.17004 1.10864 1.01080
## Proportion of Variance 0.2215 0.1197 0.0788 0.06743 0.06223 0.05587 0.04644
## Cumulative Proportion  0.2215 0.3412 0.4200 0.48739 0.54962 0.60549 0.65193
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.90481 0.87854 0.85638 0.83191 0.78579 0.75242 0.73727
## Proportion of Variance 0.03721 0.03508 0.03334 0.03146 0.02807 0.02573 0.02471
## Cumulative Proportion  0.68914 0.72423 0.75756 0.78902 0.81709 0.84282 0.86753
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.71737 0.68842 0.65040 0.63999 0.60384 0.57472 0.56456
## Proportion of Variance 0.02339 0.02154 0.01923 0.01862 0.01657 0.01501 0.01449
## Cumulative Proportion  0.89092 0.91246 0.93169 0.95031 0.96688 0.98189 0.99638
##                           PC22
## Standard deviation     0.28211
## Proportion of Variance 0.00362
## Cumulative Proportion  1.00000
pca_label <- cbind(data2, as.data.frame(pca$x))

data3 <- pca_label %>% 
  dplyr::select(PC1:PC9) %>%
  scale() %>% as.data.frame()
# MODEL ########################################################################

set.seed(7852)
km <- kmeans(data3, centers = 9, nstart = 25)
# km <- pam(data3, 9, metric = "euclidean")

clus <- cbind(data2, cluster = km$cluster)


# agnes <- agnes(data3, stand = T,
#                metric = "manhattan", # "euclidean" or "manhattan"
#                method = "ward"
# )
# clus <- cbind(data2, cluster = cutree(agnes, k = 8))


title <- "k=9 - kMeans"
subtitle <- "experimenting with excluding vars in PCA, user-standardized data"
# EVALUATING THE MODEL #########################################################

# comparing medians
clus_medians <- aggregate(x = clus[, 2:25],
                          by = list(clus$cluster),
                          FUN = median) %>% 
  pivot_longer(cols = 2:25, values_to = "median") %>% 
  rename(cluster = Group.1) %>% 
  mutate(cluster = as_factor(cluster))

clus_melt <- clus[, 2:26] %>% 
  pivot_longer(cols = 1:24, values_to = "rating") %>% 
  mutate(cluster = as_factor(cluster))

clus_sas <- clus_melt %>% 
  group_by(cluster, name) %>% 
  summarise(median = median(rating),
            ntile25 = quantile(rating, probs = 0.25),
            ntile75 = quantile(rating, probs = 0.75),
            .groups = "drop")
ggplot(clus_medians, aes(cluster, median, fill = cluster)) + 
  # ylim(0,5) + 
  facet_wrap(.~name) + theme_bw() +
  geom_col() + 
  labs(title = paste0("Comparing Clusters by Median Scores for ", title),
       subtitle = subtitle)

# this one is kinda chaos
# clus_melt %>% 
clus_melt[!clus_melt$name %in% c("ArtGalleries"),] %>%
# clus_melt[clus_melt$cluster %in% c(2,3),] %>% 
  ggplot(aes(cluster, rating, fill = cluster)) + 
  facet_wrap(.~name) + theme_bw() +
  # ylim(0,5) + 
  geom_boxplot(outlier.shape = NA, coef=0) + 
  labs(title = paste0("Comparing Clusters by Boxplot for ", title),
       subtitle = subtitle)

clus_sas %>% 
  # filter(!name %in% c("ArtGalleries")) %>%
  ggplot() + facet_wrap(.~name) + theme_bw() +
  geom_point(aes(x=cluster, y=median, color=cluster), size = 2.5) + 
  geom_segment(aes(x=cluster, xend=cluster,
                   y=ntile25, yend=ntile75,
                   color=cluster), lwd=0.8) + 
  labs(title = paste0("Comparing Clusters for ", title),
       subtitle = subtitle, caption = "Dot = median, line = IQR")

clus %>% #filter(cluster==2) %>%  
  ggplot(aes(DanceClubs, Pubs_Bars, color = factor(cluster))) + geom_point() 

data3 %>% 
  ggplot(aes(PC1, PC2, color = factor(km$cluster))) + geom_point() 

fviz_cluster(km, data = data3, geom = "point")

# avg cluster ratings
clus_medians %>% group_by(name) %>% 
  summarise(avg_median_rating = mean(median)) %>% 
  arrange(desc(avg_median_rating))
## # A tibble: 24 x 2
##    name        avg_median_rating
##    <chr>                   <dbl>
##  1 Malls                  0.502 
##  2 Restaurants            0.345 
##  3 Parks                  0.324 
##  4 Theatres               0.322 
##  5 Museums                0.273 
##  6 Beaches                0.183 
##  7 Pubs_Bars              0.156 
##  8 Resorts                0.151 
##  9 Zoo                   -0.0297
## 10 ViewPoints            -0.0629
## # ... with 14 more rows
# cluster sizes
clus %>% group_by(cluster) %>% 
  tally()
## # A tibble: 9 x 2
##   cluster     n
##     <int> <int>
## 1       1   743
## 2       2   428
## 3       3   596
## 4       4   283
## 5       5   325
## 6       6   575
## 7       7   239
## 8       8   178
## 9       9   357