モデルの性能を評価する

分類の性能評価

予測値の評価

# 融資の焦付きデータ前準備
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

credit <- read.csv("credit.csv") %>% 
  mutate(default = as.factor(default))
set.seed(1)
### 1~1000の中で900個とる
train_sample <- sample(1000, 900)
### 訓練データ
credit_train <- credit[train_sample, ]
### テストデータ
credit_test <- credit[-train_sample, ]
library(C50)
## 目的変数を除外し, 因子ベクトルとして設定する
credit_model <- C5.0(credit_train[-17], credit_train$default,
                     trials = 1, costs = NULL)

# 予測値の確率を出す(ここではprobを用いる)
predict_pred <- predict(credit_model, credit_test, type = "prob")
head(predict_pred)

##           no       yes
## 26 0.7946061 0.2053939
## 46 0.8728003 0.1271997
## 50 0.8728003 0.1271997
## 63 0.2887121 0.7112879
## 68 0.1406667 0.8593333
## 78 0.9109009 0.0890991

# データセットの読み込み
sms_results <- read.csv("sms_results.csv", stringsAsFactors = T)
## スパムかどうか実際と予測, 予測値の確率が格納されている(ナイーブベイズで)
head(sms_results)

##   actual_type predict_type prob_spam prob_ham
## 1         ham          ham   0.00000  1.00000
## 2         ham          ham   0.00000  1.00000
## 3         ham          ham   0.00016  0.99984
## 4         ham          ham   0.00004  0.99996
## 5        spam         spam   1.00000  0.00000
## 6         ham          ham   0.00020  0.99980

# 予測値の確率40%~60%をみてみる
sms_results %>% 
  filter(prob_spam > 0.4 & prob_spam < 0.6) %>% 
  head()

##   actual_type predict_type prob_spam prob_ham
## 1        spam          ham   0.47536  0.52464
## 2         ham         spam   0.56188  0.43812
## 3         ham         spam   0.57917  0.42083

ほぼ運任せの結果なので外しているさらに間違って分類しているものをみてみる

sms_results %>% 
  filter(actual_type != predict_type) %>% 
  head()

##   actual_type predict_type prob_spam prob_ham
## 1        spam          ham   0.00071  0.99929
## 2        spam          ham   0.00156  0.99844
## 3        spam          ham   0.01708  0.98292
## 4        spam          ham   0.00851  0.99149
## 5        spam          ham   0.01243  0.98757
## 6        spam          ham   0.00003  0.99997

かなり高い確率のものでもミスっていることがわかる

混同行列を調べる

正解不正解を行列形式で表示する

# 混同行列
table(sms_results$actual_type, sms_results$predict_type)

##       
##         ham spam
##   ham  1203    4
##   spam   31  152

# より細かいテーブルを出す
library(gmodels)
CrossTable(sms_results$actual_type, sms_results$predict_type)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1390 
## 
##  
##                         | sms_results$predict_type 
## sms_results$actual_type |       ham |      spam | Row Total | 
## ------------------------|-----------|-----------|-----------|
##                     ham |      1203 |         4 |      1207 | 
##                         |    16.128 |   127.580 |           | 
##                         |     0.997 |     0.003 |     0.868 | 
##                         |     0.975 |     0.026 |           | 
##                         |     0.865 |     0.003 |           | 
## ------------------------|-----------|-----------|-----------|
##                    spam |        31 |       152 |       183 | 
##                         |   106.377 |   841.470 |           | 
##                         |     0.169 |     0.831 |     0.132 | 
##                         |     0.025 |     0.974 |           | 
##                         |     0.022 |     0.109 |           | 
## ------------------------|-----------|-----------|-----------|
##            Column Total |      1234 |       156 |      1390 | 
##                         |     0.888 |     0.112 |           | 
## ------------------------|-----------|-----------|-----------|
## 
##

# 正解率と誤分類率
##  正解率
(1203 + 152) / (1203 + 4 + 31 + 152)

## [1] 0.9748201

## 誤分類率
(4 + 31) / (1203 + 4 + 31 + 152)

## [1] 0.02517986

正解率以外の指標

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(e1071)
# spamを検出したいのでpositive = "spam"を指定する
confusionMatrix(sms_results$actual_type, sms_results$predict_type,
                positive = "spam")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  ham spam
##       ham  1203    4
##       spam   31  152
##                                           
##                Accuracy : 0.9748          
##                  95% CI : (0.9652, 0.9824)
##     No Information Rate : 0.8878          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8825          
##                                           
##  Mcnemar's Test P-Value : 1.109e-05       
##                                           
##             Sensitivity : 0.9744          
##             Specificity : 0.9749          
##          Pos Pred Value : 0.8306          
##          Neg Pred Value : 0.9967          
##              Prevalence : 0.1122          
##          Detection Rate : 0.1094          
##    Detection Prevalence : 0.1317          
##       Balanced Accuracy : 0.9746          
##                                           
##        'Positive' Class : spam            
##

k統計量(Kappa):調整した正解率。0.8以上でVery good。
感度:正しく分類された陽性インスタンスの割合
特異度:正しく分類された陰性インスタンスの割合

# 感度(真陽性率)
sens <- 152 / (152 + 31)
sens

## [1] 0.8306011

# 特異度(真陰性率)
spec <- 1203 / (1203 + 4)
spec

## [1] 0.996686

適合率:陽性インスタンスのうち真陽性だったもの
再現率:結果がどれくらい完全であるか

# 適合率
## 分母は陽性と分類されたもの
## 分子は本当に陽性でかつ陽性と分類されたもの
pre <- 152 / (152 + 4)
pre

## [1] 0.974359

# 再現率
rec <- 152 / (152 + 31)
rec

## [1] 0.8306011

F値:調和平均を使って適合率と再現率を組み合わせたもの \[F = \frac{2 * Pre * Rec}{Rec + Pre} = \frac{2 * TP}{2 * TP + FP + FN}\]

f <- (2 * pre * rec) / (pre + rec)
f

## [1] 0.8967552

ROC曲線

真陽性率を縦軸, 偽陽性率を横軸としてプロットしたもの. 完全な分類ができているほど真陽性率が100%に近づき, 偽陽性率が0%に近く.予測価値のないものは\(y=x\)である

# ROC曲線を描く
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:gmodels':
## 
##     ci

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

## 1つ目のベクトルに正解ラベル, 2つ目のベクトルに陽性クラスの推定確率
sms_roc <- roc(sms_results$actual_type, sms_results$prob_spam)

## Setting levels: control = ham, case = spam

## Setting direction: controls < cases

plot(sms_roc, main = "ROC curve", col = "blue",
     lvd = 2, legacy.axes = TRUE)

# auc
sms_roc$auc

## Area under the curve: 0.9836

aucの値が0.9以上だとランクA

# 2つのモデルを比べる
## k最近傍法で予測したもの
sms_results_knn <- read.csv("sms_results_knn.csv")
sms_roc_knn <- roc(sms_results$actual_type, sms_results_knn$p_spam)

## Setting levels: control = ham, case = spam

## Setting direction: controls < cases

## ROC曲線の比較
plot(sms_roc, main = "ROC curve", col = "blue",
     lvd = 2, legacy.axes = TRUE)
plot(sms_roc_knn, col = "red", lwd = 2, add = T)

auc(sms_roc_knn)

## Area under the curve: 0.8942

将来の性能を推定する

ホールドアウト法

訓練データ50%, テストデータ25%, 検証データ25%が一般的な比率となる。

# データセットの読み込み
library(tidyverse)
credit <- read.csv("credit.csv") %>% 
  mutate(default = as.factor(default))

1000個のデータを分割していく

# 乱数発生
#orderで乱数の順番を返す
random_ids <- order(runif(1000))
## 訓練データ
credit_train <- credit[random_ids[1:500], ]
## 検証データ
credit_validate <- credit[random_ids[501:750], ]
## テストデータ
credit_test <- credit[random_ids[751:1000], ]

層化サンプリングをすることで, 各データセットでのクラスの比率がほぼ同じになる

# 層化抽出法
library(caret)
## サブセットに割り当てるたインスタンスの割合を指定
## 75%分が訓練データに割り当てられ, defaultの比率はそれぞれ一緒になる
in_train <- createDataPartition(credit$default, p = 0.75, list = F)
credit_train <- credit[in_train, ]
credit_test <- credit[-in_train, ]
## 全て7:3になってる
table(credit$default)

## 
##  no yes 
## 700 300

table(credit_train$default)

## 
##  no yes 
## 525 225

table(credit_test$default)

## 
##  no yes 
## 175  75

ただしホールドアウト法はサンプリングにバイアスがかかっている場合がある。(全ての情報が均等に分配されてるわけではないので)。

交差検証

クロスバリデーションを行う。10分割であれば, 10フォールド中9フォールドでモデルを訓練し, 残り1フォールドで評価。これを10回行う。結果の平均性能を計算。

# クロスバリデーション
## フォールドの作成, リストが作成される
folds <- createFolds(credit$default, k = 10)
## このような作業を10回やる
credit01_test <- credit[folds$Fold01, ]
credit01_train <- credit[-folds$Fold01, ]

# まとめて実行する
library(C50)
library(irr)

## Loading required package: lpSolve

cv_results <- map(folds, function(x){
  ## 訓練データ
  credit_train <- credit[-x, ]
  ## テストデータ
  credit_test <- credit[x, ]
  ## 決定木モデル式
  credit_model <- C5.0(default ~ ., data = credit_train)
  ## 予測
  credit_pred <- predict(credit_model, credit_test)
  ## テストデータの正解値
  credit_actual <- credit_test$default
  ## 正解率の計算
  kappa <- kappa2(data.frame(credit_actual, credit_pred))$value
  return(kappa)
}
)

str(cv_results)

## List of 10
##  $ Fold01: num 0.227
##  $ Fold02: num 0.0833
##  $ Fold03: num 0.33
##  $ Fold04: num 0.124
##  $ Fold05: num 0.307
##  $ Fold06: num 0.554
##  $ Fold07: num 0.444
##  $ Fold08: num 0.349
##  $ Fold09: num 0.294
##  $ Fold10: num 0.429

# 平均を求める
mean(unlist(cv_results))

## [1] 0.3141224

値としてそれほど性能はよくないことがわかる

ブートストラップ

ランダムに訓練データとテストデータを分割する。クロスバリデーションと違い, 重複を許す。

モデルの性能を評価する

Ei Taguchi

分類の性能評価

予測値の評価

混同行列を調べる

正解率以外の指標

ROC曲線

将来の性能を推定する

ホールドアウト法

ホールドアウト法

交差検証

ブートストラップ