機械学習

library(DT)
# Load the Mall Customers data set and give its columns short names, then
# show an interactive preview, five rows per page.
d <- setNames(
  read.csv("https://stats.dip.jp/01_ds/data/Mall_Customers.csv"),
  c("id", "gender", "age", "income", "score")
)

datatable(d, options = list(pageLength = 5))

# Plain scatter plot of score against income, before any clustering.
# Only the first color of the palette is used here.
NGROUPS <- 2
COL <- rainbow(NGROUPS)

matplot(
  x = d$income,
  y = d$score,
  type = "p",
  pch = 16,
  col = COL[1]
)
grid()

# Numeric codes of gender after conversion to a factor; these codes are what
# the pairs plot further down uses to pick marker shape and color.
as.numeric(as.factor(d$gender))

##   [1] 2 2 1 1 1 1 1 1 2 1 2 1 1 1 2 2 1 2 2 1 2 2 1 2 1 2 1 2 1 1 2 1 2 2 1 1 1
##  [38] 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 2 2 1 1 2 2 1 1 2 1 2 1 1 1
##  [75] 2 2 1 2 1 1 2 2 2 1 1 2 1 1 1 1 1 2 2 1 1 2 1 1 2 2 1 1 2 2 2 1 1 2 2 2 2
## [112] 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 2 2 2 2 2 2 1 1 2 1 1 2 2 1 1 2 1 1 2 2 2 1
## [149] 1 2 2 2 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 1 1 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 1
## [186] 2 1 2 1 1 1 1 2 1 1 1 1 2 2 2

library(cluster)

# Agglomerative hierarchical clustering (AGNES) on income and score, cut into
# one cluster per palette color and drawn as a color-coded scatter plot.
d_clust <- d[, c("income", "score")]
hc.a <- agnes(d_clust)

# The number of clusters follows the palette length so the two cannot drift
# apart (a cluster without a color would be drawn with col = NA).
COL <- c("red", "green", "blue", "yellow", "purple")
gr <- cutree(as.hclust(hc.a), k = length(COL))

# Rows of d grouped by cluster label; a[[i]] holds the members of cluster i.
a <- split(d, gr)

# Draw an empty frame first so the axes span the full data range, then add
# the points of each cluster in its own color.
matplot(x = d$income, y = d$score, pch = 16, type = "n", col = COL[1])

for (i in seq_along(a)) {
  matpoints(
    x = a[[i]]$income, y = a[[i]]$score,
    pch = 16, type = "p", col = COL[i]
  )
}

grid()

# List the column names of the data frame.
names(d)

## [1] "id"     "gender" "age"    "income" "score"

# Scatter-plot matrix of the numeric columns, with marker shape and color
# encoding gender.
# NOTE(review): `id` is numeric, so it is part of the matrix as well; confirm
# that this is intended.
numeric_columns <- d[, vapply(d, is.numeric, logical(1))]

# A single factor drives both the plot and the legend, so every label is
# paired with the symbol and color that were actually drawn for it.
gender_fac <- as.factor(d$gender)
gender_idx <- as.numeric(gender_fac)
level_idx <- seq_along(levels(gender_fac))

pairs(numeric_columns,
      pch = 15 + gender_idx,
      col = COL[gender_idx],
      lower.panel = NULL,
      oma = c(3, 3, 5, 3),
      main = "Mall Customers Data")

# Allow the legend to be drawn outside the plot region, then restore the
# previous setting so later plots are clipped as usual.
old_par <- par(xpd = TRUE)
legend("bottomleft",
       col = COL[level_idx],
       pch = 15 + level_idx,
       legend = levels(gender_fac))
par(old_par)
library(cluster)
library(factoextra)

##  要求されたパッケージ ggplot2 をロード中です

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Dendrogram of the agglomerative (AGNES) clustering of income and score,
# drawn horizontally with the three top-level clusters boxed and colored.
d_clust <- d[, c("income", "score")]

hc.a <- agnes(d_clust)

# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, which would silently change these options.
fviz_dend(as.hclust(hc.a), k = 3, horiz = TRUE, rect = TRUE, rect_fill = TRUE,
          color_labels_by_k = FALSE, rect_border = "jco", k_colors = "jco",
          cex = 0.4)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Divisive hierarchical clustering (DIANA) of income and score, shown as a
# horizontal dendrogram with three boxed, colored clusters.
d_clust <- d[, c("income", "score")]
hc.d <- diana(d_clust)

fviz_dend(
  as.hclust(hc.d),
  k = 3,
  horiz = TRUE,
  rect = TRUE,
  rect_fill = TRUE,
  rect_border = "jco",
  k_colors = "jco",
  color_labels_by_k = FALSE,
  cex = 0.4
)

# Compare the agglomerative coefficient (AC) of every AGNES linkage method on
# the same two features that are clustered everywhere else in this script.
# Passing the whole of `d` would also feed `id` and `gender` into the
# distance computation.
METHOD <- c("single", "complete", "average",
            "weighted", "ward", "gaverage", "flexible")

d_ac <- d[, c("income", "score")]

# Only "flexible" is given an explicit par.method; every other method runs
# with the agnes() defaults. vapply() names the result after METHOD.
ac <- vapply(METHOD, function(m) {
  if (m == "flexible") {
    agnes(d_ac, method = m, par.method = 0.5)$ac
  } else {
    agnes(d_ac, method = m)$ac
  }
}, numeric(1))

# Zoom in on the top of the scale, but lower the axis start (in steps of
# 0.05) if any coefficient falls below 0.8 so that no bar is clipped away.
y_min <- min(0.8, floor(min(ac) * 20) / 20)
barplot(ac, ylim = c(y_min, 1.0), xpd = FALSE)
abline(h = seq(0, 1, 0.05), lty = 3)

# Three criteria for choosing the number of clusters with hierarchical
# clustering (hcut): within-cluster sum of squares, gap statistic, and
# average silhouette width.
fviz_nbclust(d_clust, FUNcluster = hcut, method = "wss")

# The gap statistic compares against randomly generated reference data, so
# the seed is fixed to make the plot reproducible between runs.
set.seed(123)
fviz_nbclust(d_clust, FUNcluster = hcut, method = "gap_stat")

fviz_nbclust(d_clust, FUNcluster = hcut, method = "silhouette")

# Rebuild the rainbow palette for the current NGROUPS and redraw the raw
# income/score scatter in its first color.
COL <- rainbow(NGROUPS)

matplot(
  x = d$income,
  y = d$score,
  type = "p",
  pch = 16,
  col = COL[1]
)
grid()

library(ggplot2)

# k-means clustering of income and score into NGROUPS clusters, plotted with
# ggplot2. This chunk used to appear twice verbatim; with the same seed both
# copies produced the same model and the same plot, so it is kept once.
NGROUPS <- 5

# Fixed seed for reproducibility. nstart = 25 runs k-means from 25 random
# initializations and keeps the best fit, so the result does not hinge on a
# single (possibly poor) random start.
set.seed(123)
km <- kmeans(d[, c("income", "score")], centers = NGROUPS, nstart = 25)

# Store the assignment as a factor so ggplot treats it as a discrete color.
d$cluster <- as.factor(km$cluster)

ggplot(d, aes(x = income, y = score, color = cluster)) +
  geom_point(size = 3) +
  scale_color_manual(values = rainbow(NGROUPS)) +
  labs(title = "K-means Clustering of Mall Customers",
       x = "Income",
       y = "Score") +
  theme_minimal() +
  theme(legend.position = "top")

library(factoextra)
# Draw the k-means result with factoextra's cluster plot and give it a title.
fviz_cluster(
  km,
  data = d[, c("income", "score")]
) + labs(title = "Cluster Visualization using factoextra")

library(cluster)
# Partitioning around medoids (PAM) on income and score with NGROUPS clusters.
pm <- pam(
  d[, c("income", "score")],
  k = NGROUPS
)
plot(pm, main = "PAM Clustering of Mall Customers")

# CLARA on the same features and cluster count, using 50 sub-samples.
cl2 <- clara(
  d[, c("income", "score")],
  k = NGROUPS,
  pamLike = FALSE,
  samples = 50
)
plot(cl2, main = "CLARA Clustering of Mall Customers")

機械学習

23150161

2024-10-04