機械学習

d <- read.csv('https://stats.dip.jp/01_ds/data/heart.data.csv')[, -1]

DT::datatable(round(d,1))

COL <- c(rgb(255,   0,   0,  105, max = 255), # 赤
         rgb(  0,   0, 255,  105, max = 255), # 青
         rgb(  0, 155,   0,  105, max = 255), # 緑
         rgb(100, 100, 100,   55, max = 255)) # 灰
library(e1071)

KERNEL <- c('linear',     # 線形
            'polynomial', # 多項式
            'sigmoid',    # シグモイド
            'radial')     # ガウス

k <- 1

cv <- tune('svm', heart.disease ~ biking, data = d,
           kernel = KERNEL[k], type = 'eps-regression', 
           ranges = list(#gamma   = 2^(-4:4),      # radialなどの非線形カーネルを使うとき調整
                         epsilon = seq(0, 1, 0.1), # polynomialかsigmoidのとき調整（c0）
                         #coef0   = 2^(-4:4),      # SVRの不感帯の調整
                         cost    = 2^(-4:4)))      # コスト係数（小さいほど分類誤りを許容）
cv

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##      0.8    4
## 
## - best performance: 2.617782

x.new <- seq(0, 100, 0.1)
yhat <- predict(cv$best.model, newdata = data.frame(biking = x.new))

epsilon <- cv$best.model$epsilon * cv$best.model$y.scale$`scaled:scale`
upr <- yhat + epsilon
lwr <- yhat - epsilon

matplot(x = d$biking, y = d$heart.disease, 
        type = 'p', pch = 1, col = COL[1],
        main = paste0('SVR（カーネル：', KERNEL[k], '，ε：', round(epsilon, 2), '）'),
        xlab = '自転車通勤の割合（％）',
        ylab = '心臓病の人口割合（％）')

# サポートベクター
sv <- d[cv$best.model$index, c('biking', 'heart.disease')]
matpoints(x = sv[, 1], y = sv[, 2], pch = 16, cex = 0.5, col = 1)

# 境界線とマージン（ε）
matlines(x = x.new, y = yhat, col = COL[2], lwd = 2)
matlines(x = x.new, y = upr,  col = COL[2], lty = 3)
matlines(x = x.new, y = lwr,  col = COL[2], lty = 3)

arrows(0, yhat[1], 0, upr[1], code = 3, length = 0.1)
arrows(0, yhat[1], 0, lwr[1], code = 3, length = 0.1)

library(latex2exp)
text(1, (yhat[1]+upr[1])/2, labels = TeX('$\\epsilon$'))
text(1, (yhat[1]+lwr[1])/2, labels = TeX('$\\epsilon$'))

legend('topright', 
       col = c(COL[1], 1, COL[2], COL[2]), pch = c(1, 16, NA, NA),
       lwd = c(1, 0.5, 2, 1),
       lty = c(NA, NA, 1, 3),
       border = F, bg = 'white',
       legend = c('人口割合', 'サポートベクター', '予測値', '境界線'))

library(e1071)

KERNEL <- c('linear',     # 線形
            'polynomial', # 多項式
            'sigmoid',    # シグモイド
            'radial')     # ガウス

k <- 1

cv <- tune('svm',  heart.disease ~ smoking, data = d,
           kernel = KERNEL[k], type = 'eps-regression', 
           ranges = list(#gamma   = 10^(-4:4),      # radialなどの非線形カーネルを使うとき調整
                         epsilon = seq(0, 2, 0.1), # polynomialかsigmoidのとき調整（c0）
                         #coef0   = 2^(-4:4),      # SVRの不感帯の調整
                         cost    = 2^(-4:4)))      # コスト係数（小さいほど分類誤りを許容）
cv

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon  cost
##      1.6 0.125
## 
## - best performance: 18.88784

x.new <- seq(0, 100, 0.1)
yhat <- predict(cv$best.model, newdata = data.frame(smoking = x.new))

epsilon <- cv$best.model$epsilon * cv$best.model$y.scale$`scaled:scale`
upr <- yhat + epsilon
lwr <- yhat - epsilon

matplot(x = d$smoking, y = d$heart.disease, 
        type = 'p', pch = 1, col = COL[1],
        main = paste0('SVR（カーネル：', KERNEL[k], '，ε：', round(epsilon, 2), '）'),
        xlab = '喫煙者割合（％）',
        ylab = '心臓病の人口割合（％）')

# サポートベクター
sv <- d[cv$best.model$index, c('smoking', 'heart.disease')]
matpoints(x = sv[, 1], y = sv[, 2], pch = 16, cex = 0.5, col = 1)

# 境界線とマージン（ε）
matlines(x = x.new, y = yhat, col = COL[2], lwd = 2)
matlines(x = x.new, y = upr,  col = COL[2], lty = 3)
matlines(x = x.new, y = lwr,  col = COL[2], lty = 3)

arrows(0, yhat[1], 0, upr[1], code = 3, length = 0.1)
arrows(0, yhat[1], 0, lwr[1], code = 3, length = 0.1)

library(latex2exp)
text(1, (yhat[1]+upr[1])/2, labels = TeX('$\\epsilon$'))
text(1, (yhat[1]+lwr[1])/2, labels = TeX('$\\epsilon$'))

legend('topright', 
       col = c(COL[1], 1, COL[2], COL[2]), pch = c(1, 16, NA, NA),
       lwd = c(1, 0.5, 2, 1),
       lty = c(NA, NA, 1, 3),
       border = F, bg = 'white',
       legend = c('人口割合', 'サポートベクター', '予測値', '境界線'))

n <- 24*7*2         # 時間数
t <- 1:n + rnorm(n) # 時刻

set.seed(5)

f <- function(t) 100 + 0.1*t + 0.02*t^2 + 100*sin(2*pi*t/24)

y <- f(t) + rnorm(n, sd = 5)

d <- data.frame(t, y)

matplot(x = d$t, y = d$y, type = 'o', pch = 16, col = COL[1],
        main = 'テストデータ', xlab = '時間', ylab = '値')
grid()
matlines(x = t, y = f(t), col = COL[2])

legend('topleft', 
       col = COL[1:2], pch = c(16, NA), lty = c(NA, 1),
       legend = c('価値観','真値'))
library(plotly)

##  要求されたパッケージ ggplot2 をロード中です

## 
##  次のパッケージを付け加えます: 'plotly'

##  以下のオブジェクトは 'package:ggplot2' からマスクされています:
## 
##     last_plot

##  以下のオブジェクトは 'package:latex2exp' からマスクされています:
## 
##     TeX

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     filter

##  以下のオブジェクトは 'package:graphics' からマスクされています:
## 
##     layout

plot_ly(type = "scatter", mode = "markers") |>
  add_trace(x = d$t, y = d$y,  mode = 'markers', name = "観測値", marker = list(color = COL[1])) |>
  add_trace(x = t,   y = f(t), mode = 'lines',   name = "真値",   line   = list(color = COL[2])) |>
  layout(title = "データ",
         xaxis = list(title = "時間"),
         yaxis = list(title = "値"))

機械学習

23150161

2024-10-25