機械学習

# Print numbers with 2 significant digits from here on.
options(digits = 2)
# Load the car-purchase data (customer id, sex, age, income, purchase flag).
d0 <- read.csv('https://stats.dip.jp/01_ds/data/car_data_jp.csv')
# Summary statistics for every column.
summary(object = d0)

##    お客様番号       性別                年齢         年収           購入判断  
##  Min.   :   1   Length:1000        Min.   :18   Min.   : 15000   Min.   :0.0  
##  1st Qu.: 251   Class :character   1st Qu.:32   1st Qu.: 46375   1st Qu.:0.0  
##  Median : 500   Mode  :character   Median :40   Median : 72000   Median :0.0  
##  Mean   : 500                      Mean   :40   Mean   : 72689   Mean   :0.4  
##  3rd Qu.: 750                      3rd Qu.:48   3rd Qu.: 90000   3rd Qu.:1.0  
##  Max.   :1000                      Max.   :63   Max.   :152500   Max.   :1.0

# head()/tail() return 6 rows by default (not 5); n is written out so the
# comments and the calls cannot drift apart.
head(d0, n = 6L)    # first 6 records

tail(d0, n = 6L)    # last 6 records

(n <- nrow(d0)) # total number of records; the outer parentheses print the value

## [1] 1000

# Random seed: fixing it makes the random sampling reproducible, so every run
# of the program draws the same sample.
# A different number selects a different sample, which is again the same on
# every run.
set.seed(seed = 5)

# 訓練データと試験データに分割
# 層化無作為標本抽出
# prop：訓練データの分割割合
# strata：設定した変数の割合がそれぞれのデータセットで均等になる（層化分割）。

library(rsample)

## Warning: パッケージ 'rsample' はバージョン 4.3.3 の R の下で造られました

# Split d0 into training (4/5) and test (1/5) sets, stratified on 購入判断.
d.trte <- initial_split(data = d0, prop = 4/5, strata = 購入判断)
print(d.trte)

## <Training/Testing/Total>
## <799/201/1000>

# Extract the two partitions from the split object.
d.tr <- training(d.trte) # training data
d.te <- testing(d.trte)  # test data

library(rsample)

# Logistic regression of 購入判断 on every other column of the training data.
fit.all <- glm(
  formula = 購入判断 ~ .,
  family = 'binomial',
  data = d.tr
)

# Coefficient table, deviances and AIC of the full model.
summary(object = fit.all)

## 
## Call:
## glm(formula = 購入判断 ~ ., family = "binomial", data = d.tr)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.24e+01   9.34e-01  -13.25   <2e-16 ***
## お客様番号   1.30e-04   3.56e-04    0.37     0.71    
## 性別男性     2.49e-01   2.10e-01    1.19     0.23    
## 年齢         2.22e-01   1.72e-02   12.93   <2e-16 ***
## 年収         3.43e-05   3.68e-06    9.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1076.60  on 798  degrees of freedom
## Residual deviance:  584.57  on 794  degrees of freedom
## AIC: 594.6
## 
## Number of Fisher Scoring iterations: 6

# Reduced logistic regression: 購入判断 explained by 年齢 and 年収 only.
fit <- glm(
  formula = 購入判断 ~ 年齢 + 年収,
  family = 'binomial',
  data = d.tr
)

# Coefficient table, deviances and AIC of the reduced model.
summary(object = fit)

## 
## Call:
## glm(formula = 購入判断 ~ 年齢 + 年収, family = "binomial", 
##     data = d.tr)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.20e+01   8.74e-01  -13.78   <2e-16 ***
## 年齢         2.20e-01   1.69e-02   12.99   <2e-16 ***
## 年収         3.40e-05   3.67e-06    9.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1076.60  on 798  degrees of freedom
## Residual deviance:  586.09  on 796  degrees of freedom
## AIC: 592.1
## 
## Number of Fisher Scoring iterations: 6

# Score one hypothetical customer: 年齢 = 45, 年収 = 80000.
d.new <- data.frame(年齢 = 45, 年収 = 80000)
# type = 'response' gives the predicted probability, not the linear predictor.
p.hat <- predict(object = fit, newdata = d.new, type = 'response')

# Report the probability as a percentage with one decimal place.
sprintf('新車購入確率：%2.1f％', 100 * p.hat)

## [1] "新車購入確率：63.5％"

# Predicted purchase probabilities for every record of the test data.
p.hat <- predict(object = fit, newdata = d.te, type = 'response')

# Classification threshold (also called the cut-off value).
threshold <- 0.5

# Logical vectors: predicted class and true class for each test record.
is.pred <- (p.hat > threshold)
is.ref <- (d.te$購入判断 == 1)

# Cross-tabulate predictions (rows) against the true values (columns).
table(予測値 = is.pred, 真値 = is.ref)

##        真値
## 予測値 FALSE TRUE
##   FALSE   106   24
##   TRUE     14   57

# A prediction is correct when the predicted class equals the true class.
is.ok <- (is.pred == is.ref)
n.ok <- sum(is.ok) # number of correct predictions

# Share of correct predictions in the test data, as a percentage.
sprintf('新車購入予測精度：%2.1f％', n.ok / nrow(d.te) * 100)

## [1] "新車購入予測精度：81.1％"

# 混同（こんどう）行列 (confusion matrix)　caretパッケージ利用による詳細分析
library(caret)

## Warning: パッケージ 'caret' はバージョン 4.3.3 の R の下で造られました

##  要求されたパッケージ ggplot2 をロード中です

## Warning: パッケージ 'ggplot2' はバージョン 4.3.3 の R の下で造られました

##  要求されたパッケージ lattice をロード中です

# Confusion matrix with the purchase class (TRUE) declared as the positive
# class. By default caret uses the first factor level ('FALSE') as positive,
# so Sensitivity/Specificity and Pos/Neg Pred Value would describe
# non-purchases; output recorded with that default shows
# 'Positive' Class : FALSE and has those pairs swapped.
# The levels are fixed explicitly so that both classes exist even when every
# prediction falls on the same side of the threshold.
confusionMatrix(data = factor(is.pred, levels = c(FALSE, TRUE)),
                reference = factor(is.ref, levels = c(FALSE, TRUE)),
                positive = 'TRUE')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   106   24
##      TRUE     14   57
##                                        
##                Accuracy : 0.811        
##                  95% CI : (0.75, 0.863)
##     No Information Rate : 0.597        
##     P-Value [Acc > NIR] : 7e-11        
##                                        
##                   Kappa : 0.599        
##                                        
##  Mcnemar's Test P-Value : 0.144        
##                                        
##             Sensitivity : 0.883        
##             Specificity : 0.704        
##          Pos Pred Value : 0.815        
##          Neg Pred Value : 0.803        
##              Prevalence : 0.597        
##          Detection Rate : 0.527        
##    Detection Prevalence : 0.647        
##       Balanced Accuracy : 0.794        
##                                        
##        'Positive' Class : FALSE        
##

library(pROC)

## Warning: パッケージ 'pROC' はバージョン 4.3.3 の R の下で造られました

## Type 'citation("pROC")' for a citation.

## 
##  次のパッケージを付け加えます: 'pROC'

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     cov, smooth, var

# ROC analysis of the test-set probabilities against the true purchase flag.
# `predictor` is written in full instead of relying on partial matching of
# `predict`, and TRUE/FALSE replace the reassignable shorthands T/F.
# The call also draws the curve (plot = TRUE) and requests a confidence
# interval (ci = TRUE) with the 'thresholds' / 'best' settings.
roc1 <- roc(response = d.te$購入判断, predictor = p.hat,
            of = 'thresholds', thresholds = 'best', print.thres = 'best',
            percent = FALSE, plot = TRUE, print.auc = TRUE, grid = TRUE,
            ci = TRUE, auc.polygon = TRUE)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

coords(roc1, 'best') # optimal threshold

# Values (sensitivity, specificity) for each threshold. Stored under a
# descriptive name instead of `c`, which would shadow base::c() wherever `c`
# is passed as a value (e.g. do.call(c, ...)).
roc.coords <- coords(roc1)

library(DT)
# Show the coordinates rounded to 3 decimal places.
datatable(round(roc.coords, 3))

# Replace d0 with the Titanic data set and display it with datatable().
d0 <- read.csv('https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')

datatable(data = d0)

機械学習

演習課題

22150007

2024-09-17