このページは公開されるので個人情報などは記載しないこと。
課題が完成したらknitしPublishする。

課題名

rm(list = ls()) # 全オブジェクト削除
# ここにRコードを記入する。
#例）
plot(1:9)

#主成分分析

d <- read.csv("https://stats.dip.jp/01_ds/data/UN_jp.csv")

library(DT)
datatable(d,caption="United Nation")

r <- prcomp(d[,4:8], scale = T) # scale = T: 相関行列， F: 分散共分散行列を利用
# 【注意】国名のカラム（4〜8番目）を除いているd[, 4:8]

summary(r)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5
## Standard deviation     1.9015 0.8551 0.63807 0.42872 0.24968
## Proportion of Variance 0.7231 0.1462 0.08143 0.03676 0.01247
## Cumulative Proportion  0.7231 0.8693 0.95077 0.98753 1.00000

options(digits = 1) # 表示有効数字2桁
(variance <- r$sdev^2) # 分散（変動），固有値

## [1] 3.62 0.73 0.41 0.18 0.06

(proportion_variance <- variance / sum(variance)) # 変動割合

## [1] 0.72 0.15 0.08 0.04 0.01

(cumulative_propotion <- cumsum(proportion_variance)) # 累積変動割合

## [1] 0.7 0.9 1.0 1.0 1.0

evec <- r$rotation

datatable(round(evec, 2))

rownames(r$x) <- d$国名

datatable(round(r$x, 2))

library(factoextra)

##  要求されたパッケージ ggplot2 をロード中です

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_screeplot(r, addlabels = T)

fviz_contrib(r, choice = "var", axes = 1, top = 5)

fviz_contrib(r, choice = "var", axes = 2, top = 5)

library("corrplot")

## corrplot 0.92 loaded

var <- get_pca_var(r)
corrplot(var$cor, is.corr = T, addCoef.col = "gray")

fviz_pca_var(r, 
             col.var = "contrib", # 色分け 
             repel = T) # repel: テキストラベルの重なり防止

fviz_pca_biplot(r, col.ind = "contrib", repel = T)

‘Q1, 第1,2主成分はどのような事象を表す指標であるか.’ ‘第1主成分は健康’ ‘第2主成分は経済’

‘Q2, 最も経済的に豊かで健康に過ごせる国はどこか.’ ‘Luxembourg’

#階層的クラスタリング

library(DT)

d <- read.csv('https://stats.dip.jp/01_ds/data/Mall_Customers.csv')

colnames(d) <- c('id', 'gender', 'age', 'income', 'score')

d$gender <- ifelse(d$gender == 'Male', 1, 0)
datatable(d, options = list(pageLength = 5))

NGROUPS <- 2

# カラーパレット
COL <- rainbow(NGROUPS)

matplot(x = d$income, y = d$score, pch = 16, type = 'p', col = COL[1])
grid()

pairs(d[,c("age","income","score")],
      col=3+as.numeric(d$gender),
      pch=16+as.numeric(d$gender),
      lower.panel=NULL,oma=c(3,3,5,3),
      main="ショッピングモール顧客データ")

par(xpd = T)
legend('bottomleft', col = 4:5, pch = 16:17, legend = unique(d$gender))#ユニークは重複をなくす

#階層的クラスター分析
library(cluster)
library(factoextra)

# AGNES
hc.a <- agnes(d[,c("income","score")])
fviz_dend(as.hclust(hc.a), k = 4, horiz = T, rect = T, rect_fill = T,
 color_labels_by_k = F, rect_border = 'jco', k_colors = 'jco', cex = 0.4)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

非階層的クラスタリング

d <- read.csv('https://stats.dip.jp/01_ds/data/Mall_Customers.csv')
colnames(d) <- c('id', 'gender', 'age', 'income', 'score')

library(DT)
datatable(d, options = list(pageLength = 5))

NGROUPS <- 2

COL <- rainbow(NGROUPS)

matplot(x = d$income, y = d$score, pch = 16, type = 'p', col = COL[1])
grid()

NGROUPS <- 5

COL <- rainbow(NGROUPS)

km <- kmeans(d[,c("income","score")], centers = NGROUPS, nstart = 25)
c <- vector('list', NGROUPS)
name.group <- rep(NA, NGROUPS)
matplot(x = d$income, y = d$score, type = 'n')
grid()

for (i in 1:NGROUPS)
{
  c[[i]] <- d[km$cluster == i, ]
  
  matpoints(x = c[[i]]$income,
            y = c[[i]]$score,
            pch = 16,
            col = COL[i])
}
  
legend('topright', pch = 16, col = COL[1:NGROUPS],
        legend = paste0("Group",1:NGROUPS))

#サポートベクターマシーン

library(MASS)
n <- 100
c <- vector('list', 7)
c[[1]] <- mvrnorm(n, mu = c( 0,  0), Sigma = rbind(c(2.0,  0.0), c( 0.0, 2.0)))
c[[2]] <- mvrnorm(n, mu = c( 0, 10), Sigma = rbind(c(2.0, -0.8), c(-0.8, 2.0)))
c[[3]] <- mvrnorm(n, mu = c(10,  0), Sigma = rbind(c(2.0, -0.8), c(-0.8, 2.0)))
c[[4]] <- mvrnorm(n, mu = c(-5, -5), Sigma = rbind(c(2.0,  0.8), c( 0.8, 2.0)))
c[[5]] <- mvrnorm(n, mu = c( 5,  5), Sigma = rbind(c(2.0,  0.8), c( 0.8, 2.0)))
c[[6]] <- mvrnorm(n, mu = c(-5,  5), Sigma = rbind(c(2.0, -0.8), c(-0.8, 2.0)))
c[[7]] <- mvrnorm(n, mu = c( 5, -5), Sigma = rbind(c(2.0, -0.8), c(-0.8, 2.0)))

for (i in seq_along(c))
{
  c[[i]] <- as.data.frame(c[[i]])
  colnames(c[[i]]) <- c('x', 'y')
}

# 単純な分類用データ
d <- data.frame(c(rep(1, n), rep(0, n)), rbind(c[[1]], c[[5]]))
colnames(d) <- c('blue', 'x', 'y')



# カラーパレット
COL <- c(rgb(255,   0,   0,  105, max = 255), # 赤
         rgb(  0,   0, 255,  105, max = 255), # 青
         rgb(  0, 155,   0,  105, max = 255), # 緑
         rgb(100, 100, 100,   20, max = 255)) # 灰

draw.fig <- function()
{
  # 図枠
  matplot (NA, type = 'n', xlim = c(-10, 15), ylim = c(-10, 20),
          xlab = 'x', ylab = 'y')

  grid() # 格子線 

  # 描画
  matlines(x = c[[1]]$x, y = c[[1]]$y, type = 'p', pch = 1, col = COL[2])
  #matlines(x = c[[2]]$x, y = c[[2]]$y, type = 'p', pch = 1, col = COL[2])
  #matlines(x = c[[3]]$x, y = c[[3]]$y, type = 'p', pch = 1, col = COL[2])
  #matlines(x = c[[4]]$x, y = c[[4]]$y, type = 'p', pch = 1, col = COL[1])
  matlines(x = c[[5]]$x, y = c[[5]]$y, type = 'p', pch = 1, col = COL[1])
  #matlines(x = c[[6]]$x, y = c[[6]]$y, type = 'p', pch = 1, col = COL[1])
  #matlines(x = c[[7]]$x, y = c[[7]]$y, type = 'p', pch = 1, col = COL[1])
}

#cairo_pdf('data_svm.pdf') # PDF画像作成（ここから）
draw.fig()

# 凡例
legend('topright', col = COL[1:2], pch = c(1, 1), bg = 'white',
      legend = c('赤', '青'))

library(e1071)

# カーネル
KERNEL <- c('linear', 'polynomial', 'sigmoid', 'radial')

k <- 1 # カーネル選択番号

# 交差検証法によるパラメータ選択
cv <- tune('svm', as.factor(blue) ~ ., data = d,
           kernel = KERNEL[k], type = 'C-classification', 
           ranges = list(#gamma   = 2^(-4:4), 
                         #epsilon = seq(0, 1, 0.1),
                         #coef0   = 2^(-4:4),
                         cost    = 2^(-4:4)))
# 結果表示
cv

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.06
## 
## - best performance: 0.005

dx <- 0.2
dy <- 0.2

# 格子点データを作成
dgrid <- expand.grid(x = seq(-25, 25, dx),
                     y = seq(-25, 25, dy))

# 格子点を予測
pred <- predict(cv$best.model, newdata = dgrid)

#cairo_pdf('svm.pdf')
draw.fig()

# サポートベクター
sv <- d[cv$best.model$index, -1]
matpoints(x = sv[, 1], y = sv[, 2], pch = 16, cex = 0.5, col = 1)

dgrid.blue <- dgrid[pred == 1, ]

#matpoints(x = dgrid.blue$x, y = dgrid.blue$y, pch = 15, cex = 1.1, col = COL[4])

# 灰色塗り関数
fill.cell <- function(x, y)
{
  xline <- c(x - dx/2, x + dx/2)
  ylow  <- c(y - dy/2, y - dy/2)
  yupp  <- c(y + dy/2, y + dy/2)

  polygon(c(xline, rev(xline)), c(ylow, yupp), border = F, col = COL[4])
}

# 予測値が１の周りを正方形で灰色塗り
for (i in 1:nrow(dgrid))
{
  if (pred[i] == 1) fill.cell(dgrid$x[i], dgrid$y[i])
}

# グラフ主タイトル
title(paste0('SVM（カーネル：', KERNEL[k], '）による分類'))

# 凡例
legend('topright', col = c(COL[1:2], 1, NA), pch = c(1, 1, 16, NA),
       fill = c(NA, NA, NA, COL[4]), border = F, bg = 'white',
       legend = c('赤（０）', '青（１）', 'サポートベクター', '青（１）と分類する範囲'))

#cairo_pdf('red_blue.pdf')
matplot (NA, type = 'n', xlim = c(-10, 10), ylim = c(-10, 10),
        xlab = 'x', ylab = 'y')
grid()
matlines(x = c[[1]]$x, y = c[[1]]$y, type = 'p', pch = 1, col = COL[2])
matlines(x = c[[4]]$x, y = c[[4]]$y, type = 'p', pch = 1, col = COL[1])
matlines(x = c[[5]]$x, y = c[[5]]$y, type = 'p', pch = 1, col = COL[1])
matlines(x = c[[6]]$x, y = c[[6]]$y, type = 'p', pch = 1, col = COL[1])
matlines(x = c[[7]]$x, y = c[[7]]$y, type = 'p', pch = 1, col = COL[1])

legend('topright', col = COL[1:2], pch = c(1, 1), bg = 'white',
      legend = c('赤', '青'))

#dev.off()

library(plot3D)
f <- function(x, y) x^2 + y^2

x.g <- seq(-50, 50, 5)
y.g <- seq(-50, 50, 5)
z.g <- outer(x.g, y.g, function(x, y) x*0 + y*0 + 10)

library(latex2exp)
#cairo_pdf('kernel_trick.pdf')
scatter3D(x = c[[1]]$x, y = c[[1]]$y, z = f(c[[1]]$x, c[[1]]$y), 
          pch = 16, col = COL[2], bty = 'f', ticktype = 'detailed',
          theta = 45, phi = 15,
          main = TeX('$z = x^2 + y^2'),
          xlim = c(-10, 10),
          ylim = c(-10, 10),
          zlim = c(0, 100),
          surf = list(x = x.g, y = y.g, z = z.g, facet = NA, border = 'green'))

scatter3D(x = c[[4]]$x, y = c[[4]]$y, z = f(c[[4]]$x, c[[4]]$y), pch = 16, col = COL[1], add = T) 
scatter3D(x = c[[5]]$x, y = c[[5]]$y, z = f(c[[5]]$x, c[[5]]$y), pch = 16, col = COL[1], add = T) 
scatter3D(x = c[[6]]$x, y = c[[6]]$y, z = f(c[[6]]$x, c[[6]]$y), pch = 16, col = COL[1], add = T) 
scatter3D(x = c[[7]]$x, y = c[[7]]$y, z = f(c[[7]]$x, c[[7]]$y), pch = 16, col = COL[1], add = T)

KERNEL # カーネル関数の種類

## [1] "linear"     "polynomial" "sigmoid"    "radial"

 # 複雑な分類用データ
  d <- data.frame(c(rep(1, n*3), rep(0, n*4)),
                  rbind(c[[1]], c[[2]], c[[3]], c[[4]], c[[5]], c[[6]], c[[7]]))
  colnames(d) <- c('blue', 'x', 'y')
  head(d)

##   blue     x    y
## 1    1  1.13 -1.8
## 2    1  0.70 -0.6
## 3    1 -0.53 -0.2
## 4    1 -0.42  0.2
## 5    1  0.84  1.9
## 6    1 -0.03  2.5

 # 図枠
  matplot (NA, type = 'n',
           xlim = c(-10, 15), ylim = c(-10, 20),
           xlab = 'x', ylab = 'y')

  grid() # 格子線

  # データ抽出
  d.red  <- d[d$blue == 0, ] # 赤データ
  d.blue <- d[d$blue == 1, ] # 青データ

  # 描画
  matlines(x = d.red$x,  y = d.red$y,  type = 'p', pch = 1, col = COL[1])
  matlines(x = d.blue$x, y = d.blue$y, type = 'p', pch = 1, col = COL[2])

  # 凡例
  legend('topright', col = COL[1:2], pch = c(1, 1), bg = 'white',
        legend = c('赤', '青'))

機械学習

演習課題

2023-10-20

課題名

非階層的クラスタリング