library(DT)
# Download the mall-customer data and give the five columns short names.
mall_url <- "https://stats.dip.jp/01_ds/data/Mall_Customers.csv"
d <- read.csv(mall_url)
names(d) <- c("id", "gender", "age", "income", "score")

# Interactive table preview, five rows per page.
datatable(d, options = list(pageLength = 5))

# Two-colour palette; only the first colour is used for this scatter plot.
NGROUPS <- 2
COL <- rainbow(NGROUPS)

# Income vs. score, all points in a single colour.
matplot(
  x = d$income,
  y = d$score,
  type = "p",
  pch = 16,
  col = COL[1]
)
grid()

# Print the numeric codes of the gender factor (levels are sorted, so the
# codes follow alphabetical level order, not order of appearance).
as.numeric(factor(d$gender))
## [1] 2 2 1 1 1 1 1 1 2 1 2 1 1 1 2 2 1 2 2 1 2 2 1 2 1 2 1 2 1 1 2 1 2 2 1 1 1
## [38] 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 2 2 1 1 2 2 1 1 2 1 2 1 1 1
## [75] 2 2 1 2 1 1 2 2 2 1 1 2 1 1 1 1 1 2 2 1 1 2 1 1 2 2 1 1 2 2 2 1 1 2 2 2 2
## [112] 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 2 2 2 2 2 2 1 1 2 1 1 2 2 1 1 2 1 1 2 2 2 1
## [149] 1 2 2 2 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 1 1 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 1
## [186] 2 1 2 1 1 1 1 2 1 1 1 1 2 2 2
library(cluster)
# Agglomerative clustering (AGNES) on income and score, cut into 5 groups.
d_clust <- d[, c("income", "score")]
hc.a <- agnes(d_clust)
gr <- cutree(as.hclust(hc.a), k = 5)

# One colour per cluster; the vector length matches k above.
COL <- c("red", "green", "blue", "yellow", "purple")

# One data frame of customers per cluster. split() replaces the manual
# index loop; unname() keeps `a` an unnamed list, as before, so a[[i]]
# still holds the rows where gr == i.
a <- unname(split(d, gr))

# Draw an empty frame first so the axes span all points, then add the
# clusters one layer at a time, each in its own colour.
matplot(x = d$income, y = d$score, pch = 16, type = "n", col = COL[1])
for (i in seq_along(a)) {
  matpoints(
    x = a[[i]]$income, y = a[[i]]$score,
    pch = 16, type = "p", col = COL[i]
  )
}
grid()

colnames(d)
## [1] "id" "gender" "age" "income" "score"

# Scatter-plot matrix of the numeric columns, with colour and plotting
# symbol chosen by gender.
# NOTE(review): `id` is numeric, so it is part of the matrix -- confirm that
# is intended.
numeric_columns <- d[, vapply(d, is.numeric, logical(1))]

# Factor codes index both the colours and the symbols; the legend below has
# to use the same level order, otherwise the labels end up swapped.
gender_f <- as.factor(d$gender)
gender_code <- as.numeric(gender_f)
gender_idx <- seq_along(levels(gender_f))

pairs(numeric_columns, pch = 15 + gender_code,
      col = COL[gender_code],
      lower.panel = NULL,
      oma = c(3, 3, 5, 3),
      main = 'Mall Customers Data')

# Allow the legend to be drawn outside the plot region, then restore the
# previous setting so later plots are not affected.
old_par <- par(xpd = TRUE)
legend('bottomleft',
       col = COL[gender_idx],
       pch = 15 + gender_idx,
       legend = levels(gender_f))
par(old_par)
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# AGNES dendrogram on income and score, cut into 3 coloured clusters.
d_clust <- d[, c("income", "score")]
hc.a <- agnes(d_clust)

# Spell out TRUE/FALSE: T and F are ordinary variables that can be
# reassigned, and the DIANA dendrogram call in this script already uses the
# full names.
fviz_dend(as.hclust(hc.a), k = 3, horiz = TRUE, rect = TRUE, rect_fill = TRUE,
          color_labels_by_k = FALSE, rect_border = "jco", k_colors = "jco",
          cex = 0.4)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Divisive clustering (DIANA) on the same two features, drawn as a
# horizontal dendrogram cut into 3 coloured clusters.
d_clust <- d[, c("income", "score")]
hc.d <- diana(d_clust)
fviz_dend(
  as.hclust(hc.d),
  k = 3,
  horiz = TRUE,
  rect = TRUE,
  rect_fill = TRUE,
  color_labels_by_k = FALSE,
  rect_border = "jco",
  k_colors = "jco",
  cex = 0.4
)

# Compare the agglomerative coefficient (ac) of every AGNES linkage method.
METHOD <- c("single", "complete", "average",
            "weighted", "ward", "gaverage", "flexible")

# Cluster on the same numeric features as the rest of the analysis. The
# full data frame `d` also holds the row identifier `id` and the character
# column `gender`, which must not enter the distance computation.
ac_features <- d[, c("income", "score")]

# "flexible" needs an explicit par.method; the others use their defaults.
# vapply() over the method names also names the result vector.
ac <- vapply(METHOD, function(m) {
  if (m == "flexible") {
    agnes(ac_features, method = m, par.method = 0.5)$ac
  } else {
    agnes(ac_features, method = m)$ac
  }
}, numeric(1))

# Keep the original 0.8 lower limit unless a coefficient falls below it, in
# which case round down to the next 0.05 so no bar is clipped away.
ac_floor <- min(0.8, floor(min(ac) * 20) / 20)
barplot(ac, ylim = c(ac_floor, 1.0), xpd = FALSE)
abline(h = seq(0, 1, 0.05), lty = 3)

# Three diagnostics for the number of hierarchical clusters, one plot each.

# Total within-cluster sum of squares.
fviz_nbclust(
  d_clust,
  FUNcluster = hcut,
  method = "wss"
)

# Gap statistic.
fviz_nbclust(
  d_clust,
  FUNcluster = hcut,
  method = "gap_stat"
)

# Average silhouette width.
fviz_nbclust(
  d_clust,
  FUNcluster = hcut,
  method = "silhouette"
)

# Reset the palette from NGROUPS and redraw the plain income/score scatter
# in its first colour.
COL <- rainbow(NGROUPS)
matplot(
  x = d$income,
  y = d$score,
  type = "p",
  pch = 16,
  col = COL[1]
)
grid()

library(ggplot2)
# k-means with 5 centres on income and score; the seed fixes the random
# starting centres so the partition is reproducible.
NGROUPS <- 5
set.seed(123)
km <- kmeans(d[, c("income", "score")], centers = NGROUPS)

# Store the assignment on the data frame as a factor for discrete colours.
d[["cluster"]] <- as.factor(km$cluster)

# Scatter plot coloured by cluster, legend above the panel.
ggplot(d, aes(x = income, y = score, colour = cluster)) +
  geom_point(size = 3) +
  scale_colour_manual(values = rainbow(NGROUPS)) +
  labs(
    title = "K-means Clustering of Mall Customers",
    x = "Income",
    y = "Score"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

library(ggplot2)
# This chunk repeats the k-means fit and plot that already appear earlier in
# the script: same seed, same features, same number of centres.
NGROUPS <- 5
set.seed(123)
km <- kmeans(
  d[, c("income", "score")],
  centers = NGROUPS
)
d$cluster <- as.factor(km$cluster)

# Income vs. score coloured by cluster membership.
ggplot(
  data = d,
  mapping = aes(x = income, y = score, color = cluster)
) +
  geom_point(size = 3) +
  scale_color_manual(values = rainbow(NGROUPS)) +
  labs(
    x = "Income",
    y = "Score",
    title = "K-means Clustering of Mall Customers"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

library(factoextra)
# factoextra view of the k-means result `km`, drawn from the two features
# it was fitted on.
fviz_cluster(
  km,
  data = d[, c("income", "score")]
) +
  labs(title = "Cluster Visualization using factoextra")

library(cluster)
# Partitioning around medoids (PAM) on income and score with NGROUPS
# clusters.
pm <- pam(
  d[, c("income", "score")],
  k = NGROUPS
)
plot(pm, main = "PAM Clustering of Mall Customers")


# CLARA on the same features, using 50 samples and pamLike = FALSE.
cl2 <- clara(
  d[, c("income", "score")],
  k = NGROUPS,
  pamLike = FALSE,
  samples = 50
)
plot(cl2, main = "CLARA Clustering of Mall Customers")

