機械学習

library(DT)

d <- read.csv('https://stats.dip.jp/01_ds/data/Mall_Customers.csv')

colnames(d) <- c('id', 'gender', 'age', 'income', 'score')

datatable(d, options = list(pageLength = 5))

NGROUPS <- 2

COL <- rainbow(NGROUPS)

matplot(x = d$income, y = d$score, pch = 16, type = 'p', col = COL[1])
grid()

COL <- c(rgb(255,   0,   0,  105, max = 255), 
         rgb(  0,   0, 255,  105, max = 255),  
         rgb(  0, 155,   0,  105, max = 255), 
         rgb(100, 100, 100,   55, max = 255))

library(cluster)
library(factoextra)

##  要求されたパッケージ ggplot2 をロード中です

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

hc.a <- agnes(d)
fviz_dend(as.hclust(hc.a), k = 3, horiz = T, rect = T, rect_fill = T,
 color_labels_by_k = F, rect_border = 'jco', k_colors = 'jco', cex = 0.4)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

gr <- cutree(hc.a, k = 3) 

head(gr)

## [1] 1 1 1 1 1 1

hc.d <- diana(d)
fviz_dend(as.hclust(hc.d), k = 3, horiz = T, rect = T, rect_fill = T,
 color_labels_by_k = F, rect_border = 'jco', k_colors = 'jco', cex = 0.4)

library(plotly)

## 
##  次のパッケージを付け加えます: 'plotly'

##  以下のオブジェクトは 'package:ggplot2' からマスクされています:
## 
##     last_plot

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     filter

##  以下のオブジェクトは 'package:graphics' からマスクされています:
## 
##     layout

library(ggplot2)
library(cluster)
library(ggdendro)
hc.a |> as.dendrogram() |> ggdendrogram(rotate = T) |> ggplotly()

hc.d |> as.dendrogram() |> ggdendrogram(rotate = T) |> ggplotly()

METHOD <- c('single', 'complete', 'average',
            'weighted', 'ward', 'gaverage', 'flexible')

ac <- rep(NA, 7)
for (i in 1:6) ac[i] <- agnes(d, method = METHOD[i])$ac
ac[7] <- agnes(d, method = METHOD[7], par.method = 0.5)$ac
names(ac) <- METHOD

barplot(ac, ylim = c(0.8, 1.0), xpd = F)
abline(h = seq(0, 1, 0.05), lty = 3)

fviz_nbclust(d[, -5], FUNcluster = hcut, method = 'wss')

## Warning in stats::dist(x): 強制変換により NA が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

## Warning in stats::dist(x, method = method, ...): 強制変換により NA
## が生成されました

as.numeric(as.factor(d$gender))

##   [1] 2 2 1 1 1 1 1 1 2 1 2 1 1 1 2 2 1 2 2 1 2 2 1 2 1 2 1 2 1 1 2 1 2 2 1 1 1
##  [38] 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 2 2 1 1 2 2 1 1 2 1 2 1 1 1
##  [75] 2 2 1 2 1 1 2 2 2 1 1 2 1 1 1 1 1 2 2 1 1 2 1 1 2 2 1 1 2 2 2 1 1 2 2 2 2
## [112] 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 2 2 2 2 2 2 1 1 2 1 1 2 2 1 1 2 1 1 2 2 2 1
## [149] 1 2 2 2 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 1 1 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 1
## [186] 2 1 2 1 1 1 1 2 1 1 1 1 2 2 2

NGROUPS <- 4 
gr <- cutree(hc.a, k = NGROUPS) 
a <- vector('list', NGROUPS)
for (i in 1:NGROUPS) a[[i]] <- d[gr == i, ]

matplot(x = d$income, y = d$score, pch = 16, type = 'n', col = COL[1])
for  (i in 1:NGROUPS)
{
  matpoints(x = a[[i]]$income,
           y = a[[i]]$score, pch = 16, type = 'p', col = COL[i])}
grid()

pairs(d[, c('age', 'income', 'score')],
      col = 3 + as.numeric(as.factor(d$gender)),
      pch = 15 + as.numeric(as.factor(d$gender)),
      lower.panel = NULL, oma = c(3, 3, 5, 3),
      main = 'ショッピングモール　顧客データ' )

par(xpd = T)
legend('bottomleft', col = 4:5, pch = 16:17, legend = unique(d$gender))

gr <- cutree(hc.a, k = NGROUPS)

a <- vector('list', NGROUPS)
for (i in 1:NGROUPS) a[[i]] <- d[gr == i,]

matplot(x = d$income, y = d$score, pch = 16, type = 'n', col = COL[1]) 
for (i in 1:NGROUPS)
{matpoints(x = a[[i]]$income,
           y = a[[i]]$score, pch =16,type = 'p', col = COL[i])}
grid()

d <- read.csv('https://stats.dip.jp/01_ds/data/Mall_Customers.csv')
colnames(d) <- c('id', 'gender', 'age', 'income', 'score')

library(DT)
datatable(d, options = list(pageLength = 5))

NGROUPS <- 6  
COL <- rainbow(NGROUPS)  

km <- kmeans(d$income, centers = NGROUPS)

c <- vector('list', NGROUPS)

matplot(x = d$income, y = d$score, type = 'n', pch = 1)
grid()

for (i in 1:NGROUPS) {
  c[[i]] <- d[km$cluster == i, ]
  
  if (nrow(c[[i]]) > 0) {
    matpoints(x = c[[i]]$income, y = c[[i]]$score, pch = 16, col = COL[i])
  }
}

legend('topright', pch = 16, bg = 'white', col = COL[1:NGROUPS], legend = paste0('Group', 1:NGROUPS))

NGROUPS <- 6  
COL <- rainbow(NGROUPS)  

c <- kmeans(d$income, centers = NGROUPS)

c <- vector('list', NGROUPS)

matplot(x = d$income, y = d$score, type = 'n', pch = 1)
grid()

for (i in 1:NGROUPS) 
{
  c[[i]] <- d[km$cluster == i, ]
  matpoints(x = c[[i]]$income, y = c[[i]]$score, pch = 16, col = COL[i])
}

legend('topright', pch = 16, bg = 'white', col = COL[1:NGROUPS], legend = paste0('Group', 1:NGROUPS))

機械学習

演習課題

23150005

2024-10-08