# NK Project 3 Cluster Analysis
# Part II -
# Evaluating clusters
# PREP #########################################################################
# Packages
pacman::p_load(
pacman, rio, tidyverse, magrittr, janitor, # general stuff
psych, # EDA
visdat, # missingness
data.table, # working with data.tables
corrplot, # correlation plot
FactoMineR, # EDA, PCA, MFA
factoextra, # extract and visualize PCA/MFA
nFactors, # how many factors/components to retain
cluster, # clustering algorithms
NbClust, # number of clusters
clValid # ?
)
# Data
data <- import("data/Travel_Review.xlsx", range = "A1:Y5457",
na = "0")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in L2714 / R2714C12: got '2 2.'
# clean variable names
data %<>% rename(SwimmingPools = `Swimming Pools`)
# drop missings
complete <- data[complete.cases(data),]
# standardize by user
data2 <- complete
data2[,-1] <- t(data2[,-1]) %>% scale() %>% t()
# PCA WITHOUT DANCE CLUB
# X <- subset(data2, select = -c(UserID, DanceClubs))
# pca <- prcomp(X, center = TRUE, scale. = TRUE)
# pca_label <- cbind(data2, as.data.frame(pca$x))
#
# data3 <- pca_label %>%
# dplyr::select(PC1:PC7, DanceClubs) %>%
# scale() %>% as.data.frame()
# PCA WITH DANCE CLUB
# X <- subset(data2, select = -c(UserID))
# pca <- prcomp(X, center = TRUE, scale. = TRUE)
# pca_label <- cbind(data2, as.data.frame(pca$x))
#
# data3 <- pca_label %>%
# dplyr::select(PC1:PC7) %>%
# scale() %>% as.data.frame()
# PCA - playing with excluding vars
X <- subset(data2, select = -c(UserID
# , DanceClubs
, ArtGalleries
, LocalServices
# , BeautySpas
))
pca <- prcomp(X, center = TRUE, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.2072 1.6229 1.3167 1.21797 1.17004 1.10864 1.01080
## Proportion of Variance 0.2215 0.1197 0.0788 0.06743 0.06223 0.05587 0.04644
## Cumulative Proportion 0.2215 0.3412 0.4200 0.48739 0.54962 0.60549 0.65193
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.90481 0.87854 0.85638 0.83191 0.78579 0.75242 0.73727
## Proportion of Variance 0.03721 0.03508 0.03334 0.03146 0.02807 0.02573 0.02471
## Cumulative Proportion 0.68914 0.72423 0.75756 0.78902 0.81709 0.84282 0.86753
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.71737 0.68842 0.65040 0.63999 0.60384 0.57472 0.56456
## Proportion of Variance 0.02339 0.02154 0.01923 0.01862 0.01657 0.01501 0.01449
## Cumulative Proportion 0.89092 0.91246 0.93169 0.95031 0.96688 0.98189 0.99638
## PC22
## Standard deviation 0.28211
## Proportion of Variance 0.00362
## Cumulative Proportion 1.00000
pca_label <- cbind(data2, as.data.frame(pca$x))
data3 <- pca_label %>%
dplyr::select(PC1:PC9) %>%
scale() %>% as.data.frame()
# MODEL ########################################################################
set.seed(7852)
km <- kmeans(data3, centers = 9, nstart = 25)
# km <- pam(data3, 9, metric = "euclidean")
clus <- cbind(data2, cluster = km$cluster)
# agnes <- agnes(data3, stand = T,
# metric = "manhattan", # "euclidean" or "manhattan"
# method = "ward"
# )
# clus <- cbind(data2, cluster = cutree(agnes, k = 8))
title <- "k=9 - kMeans"
subtitle <- "experimenting with excluding vars in PCA, user-standardized data"
# EVALUATING THE MODEL #########################################################
# comparing medians
clus_medians <- aggregate(x = clus[, 2:25],
by = list(clus$cluster),
FUN = median) %>%
pivot_longer(cols = 2:25, values_to = "median") %>%
rename(cluster = Group.1) %>%
mutate(cluster = as_factor(cluster))
clus_melt <- clus[, 2:26] %>%
pivot_longer(cols = 1:24, values_to = "rating") %>%
mutate(cluster = as_factor(cluster))
clus_sas <- clus_melt %>%
group_by(cluster, name) %>%
summarise(median = median(rating),
ntile25 = quantile(rating, probs = 0.25),
ntile75 = quantile(rating, probs = 0.75),
.groups = "drop")
ggplot(clus_medians, aes(cluster, median, fill = cluster)) +
# ylim(0,5) +
facet_wrap(.~name) + theme_bw() +
geom_col() +
labs(title = paste0("Comparing Clusters by Median Scores for ", title),
subtitle = subtitle)

# this one is kinda chaos
# clus_melt %>%
clus_melt[!clus_melt$name %in% c("ArtGalleries"),] %>%
# clus_melt[clus_melt$cluster %in% c(2,3),] %>%
ggplot(aes(cluster, rating, fill = cluster)) +
facet_wrap(.~name) + theme_bw() +
# ylim(0,5) +
geom_boxplot(outlier.shape = NA, coef=0) +
labs(title = paste0("Comparing Clusters by Boxplot for ", title),
subtitle = subtitle)

clus_sas %>%
# filter(!name %in% c("ArtGalleries")) %>%
ggplot() + facet_wrap(.~name) + theme_bw() +
geom_point(aes(x=cluster, y=median, color=cluster), size = 2.5) +
geom_segment(aes(x=cluster, xend=cluster,
y=ntile25, yend=ntile75,
color=cluster), lwd=0.8) +
labs(title = paste0("Comparing Clusters for ", title),
subtitle = subtitle, caption = "Dot = median, line = IQR")

clus %>% #filter(cluster==2) %>%
ggplot(aes(DanceClubs, Pubs_Bars, color = factor(cluster))) + geom_point()

data3 %>%
ggplot(aes(PC1, PC2, color = factor(km$cluster))) + geom_point()

fviz_cluster(km, data = data3, geom = "point")

# avg cluster ratings
clus_medians %>% group_by(name) %>%
summarise(avg_median_rating = mean(median)) %>%
arrange(desc(avg_median_rating))
## # A tibble: 24 x 2
## name avg_median_rating
## <chr> <dbl>
## 1 Malls 0.502
## 2 Restaurants 0.345
## 3 Parks 0.324
## 4 Theatres 0.322
## 5 Museums 0.273
## 6 Beaches 0.183
## 7 Pubs_Bars 0.156
## 8 Resorts 0.151
## 9 Zoo -0.0297
## 10 ViewPoints -0.0629
## # ... with 14 more rows
# cluster sizes
clus %>% group_by(cluster) %>%
tally()
## # A tibble: 9 x 2
## cluster n
## <int> <int>
## 1 1 743
## 2 2 428
## 3 3 596
## 4 4 283
## 5 5 325
## 6 6 575
## 7 7 239
## 8 8 178
## 9 9 357