次のデータを用いて新車購入の確率を予測する。

データ

User.ID:お客様番号
Gender:性別(Male:男性,Female:女性)
Age: 年齢
AnnualSalary:年収(US$)
Purchased:新車購入の有無(1:購入,0:未購入)

options(digits = 2)

d <- na.omit(read.csv(file = "car_data.csv"))
(n <- nrow(d))
## [1] 1000
set.seed(5)

ii <- sample(x = 1:n, size = 900)

d.train <- d[ii, ]
d.test  <- d[-ii, ]

head(d)
##   User.ID Gender Age AnnualSalary Purchased
## 1     385   Male  35        20000         0
## 2     681   Male  40        43500         0
## 3     353   Male  49        74000         0
## 4     895   Male  40       107500         1
## 5     661   Male  25        79000         0
## 6     846 Female  47        33500         1
summary(d)
##     User.ID        Gender               Age      AnnualSalary      Purchased  
##  Min.   :   1   Length:1000        Min.   :18   Min.   : 15000   Min.   :0.0  
##  1st Qu.: 251   Class :character   1st Qu.:32   1st Qu.: 46375   1st Qu.:0.0  
##  Median : 500   Mode  :character   Median :40   Median : 72000   Median :0.0  
##  Mean   : 500                      Mean   :40   Mean   : 72689   Mean   :0.4  
##  3rd Qu.: 750                      3rd Qu.:48   3rd Qu.: 90000   3rd Qu.:1.0  
##  Max.   :1000                      Max.   :63   Max.   :152500   Max.   :1.0

ロジスティック回帰モデル

\[ \begin{align} \hat{p}&=\frac{1}{1+e^{-(\beta_0+\beta_1 x_{i}+\cdots+\beta_k x_{k})}}\\ &ここで,\\ &\quad \hat{p}:確率の推定値\\ &\quad x_{k}: 説明変数\\ &\quad \beta_k: 偏回帰係数\\ \end{align} \]

ロジスティック回帰分析

データ適合(fitting)

fit0 <- glm(Purchased ~ ., data = d, family = "binomial")
summary(fit0)
## 
## Call:
## glm(formula = Purchased ~ ., family = "binomial", data = d)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.927  -0.593  -0.134   0.480   2.470  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.22e+01   8.21e-01  -14.88   <2e-16 ***
## User.ID       7.56e-05   3.16e-04    0.24    0.811    
## GenderMale    3.20e-01   1.86e-01    1.72    0.085 .  
## Age           2.19e-01   1.52e-02   14.47   <2e-16 ***
## AnnualSalary  3.37e-05   3.23e-06   10.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1347.6  on 999  degrees of freedom
## Residual deviance:  742.9  on 995  degrees of freedom
## AIC: 752.9
## 
## Number of Fisher Scoring iterations: 6
fit <- glm(Purchased ~ Gender + Age + AnnualSalary, 
           data = d, family = "binomial")
summary(fit)
## 
## Call:
## glm(formula = Purchased ~ Gender + Age + AnnualSalary, family = "binomial", 
##     data = d)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.928  -0.595  -0.135   0.478   2.476  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.22e+01   8.05e-01  -15.13   <2e-16 ***
## GenderMale    3.18e-01   1.86e-01    1.72    0.086 .  
## Age           2.19e-01   1.52e-02   14.47   <2e-16 ***
## AnnualSalary  3.37e-05   3.23e-06   10.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1347.63  on 999  degrees of freedom
## Residual deviance:  742.96  on 996  degrees of freedom
## AIC: 751
## 
## Number of Fisher Scoring iterations: 6

予測

年収80,000US$の45歳男性の新車購入確率を求める。

dp <- data.frame(Gender = "Male", Age = 45, AnnualSalary = 80000)
phat <- predict(fit, type = "response", newdata = dp)
sprintf("新車購入確率:%2.1f%", phat * 100)
## [1] "新車購入確率:67.1%"
phat <- predict(fit, type = "response", newdata = d.test)

# 混同行列(confusion matrix)

threshold <- 0.5 # 閾値(しきいち),カットオフ値(cut-off)とも呼ばれる。
is.pred <- phat > threshold
is.ref <- d.test$Purchased == 1
table(Prediction = is.pred, Reference = is.ref)
##           Reference
## Prediction FALSE TRUE
##      FALSE    58    8
##      TRUE      6   28
is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)
sprintf("新車購入予測精度:%2.1f%", n.ok / nrow(d.test) * 100)
## [1] "新車購入予測精度:86.0%"
# 混同行列 (confusion matrix) caretパッケージを利用した詳細分析
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
confusionMatrix(data = as.factor(is.pred), 
                reference = as.factor(is.ref))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    58    8
##      TRUE      6   28
##                                         
##                Accuracy : 0.86          
##                  95% CI : (0.776, 0.921)
##     No Information Rate : 0.64          
##     P-Value [Acc > NIR] : 8.06e-07      
##                                         
##                   Kappa : 0.692         
##                                         
##  Mcnemar's Test P-Value : 0.789         
##                                         
##             Sensitivity : 0.906         
##             Specificity : 0.778         
##          Pos Pred Value : 0.879         
##          Neg Pred Value : 0.824         
##              Prevalence : 0.640         
##          Detection Rate : 0.580         
##    Detection Prevalence : 0.660         
##       Balanced Accuracy : 0.842         
##                                         
##        'Positive' Class : FALSE         
## 

ROC曲線

用語

ROC (Receiver Operating Characteristic):受信者動作特性
AUC (Area Under Curve) :ROC曲線の下側の面積
Sensitivity (true positive rate) :真陽性率(感度)
Specificity (true negative rate) :真陰性率(特異度)

参考文献:【名古屋市立大学】感度・特異度・ROC曲線

library(pROC)
roc1 <- roc(response = d.test$Purchased, predict = phat,
            plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

coords(roc1, "best") # 最適閾値 (optimal threshold)
##   threshold specificity sensitivity
## 1      0.38        0.83        0.92
coords(roc1) # 閾値 (threshold)ごとの値 (sensitivity, specificity)
##     threshold specificity sensitivity
## 1        -Inf       0.000       1.000
## 2      0.0030       0.016       1.000
## 3      0.0040       0.031       1.000
## 4      0.0044       0.047       1.000
## 5      0.0053       0.062       1.000
## 6      0.0071       0.078       1.000
## 7      0.0084       0.094       1.000
## 8      0.0108       0.109       1.000
## 9      0.0137       0.125       1.000
## 10     0.0154       0.141       1.000
## 11     0.0200       0.156       1.000
## 12     0.0244       0.172       1.000
## 13     0.0254       0.188       1.000
## 14     0.0264       0.203       1.000
## 15     0.0283       0.219       1.000
## 16     0.0301       0.234       1.000
## 17     0.0345       0.250       1.000
## 18     0.0385       0.266       1.000
## 19     0.0400       0.281       1.000
## 20     0.0426       0.297       1.000
## 21     0.0447       0.312       1.000
## 22     0.0459       0.328       1.000
## 23     0.0482       0.344       1.000
## 24     0.0499       0.359       1.000
## 25     0.0551       0.375       1.000
## 26     0.0604       0.391       1.000
## 27     0.0634       0.406       1.000
## 28     0.0690       0.422       1.000
## 29     0.0811       0.438       1.000
## 30     0.0906       0.453       1.000
## 31     0.0913       0.469       1.000
## 32     0.0956       0.484       1.000
## 33     0.1016       0.500       1.000
## 34     0.1113       0.516       1.000
## 35     0.1214       0.531       1.000
## 36     0.1281       0.547       1.000
## 37     0.1442       0.562       1.000
## 38     0.1566       0.578       1.000
## 39     0.1721       0.594       1.000
## 40     0.1958       0.594       0.972
## 41     0.2058       0.609       0.972
## 42     0.2085       0.625       0.972
## 43     0.2165       0.641       0.972
## 44     0.2236       0.656       0.972
## 45     0.2265       0.672       0.972
## 46     0.2293       0.688       0.972
## 47     0.2310       0.703       0.972
## 48     0.2328       0.719       0.972
## 49     0.2360       0.734       0.972
## 50     0.2432       0.750       0.972
## 51     0.2494       0.750       0.944
## 52     0.2693       0.766       0.944
## 53     0.2911       0.781       0.944
## 54     0.3206       0.797       0.944
## 55     0.3481       0.797       0.917
## 56     0.3656       0.812       0.917
## 57     0.3847       0.828       0.917
## 58     0.3910       0.828       0.889
## 59     0.4124       0.828       0.861
## 60     0.4331       0.844       0.861
## 61     0.4363       0.844       0.833
## 62     0.4419       0.859       0.833
## 63     0.4467       0.875       0.833
## 64     0.4533       0.891       0.833
## 65     0.4662       0.891       0.806
## 66     0.4851       0.891       0.778
## 67     0.5060       0.906       0.778
## 68     0.5212       0.906       0.750
## 69     0.5296       0.906       0.722
## 70     0.5334       0.906       0.694
## 71     0.5377       0.922       0.694
## 72     0.5412       0.922       0.667
## 73     0.5510       0.922       0.639
## 74     0.5620       0.922       0.611
## 75     0.5683       0.938       0.611
## 76     0.5800       0.953       0.611
## 77     0.6001       0.953       0.583
## 78     0.6379       0.953       0.556
## 79     0.6779       0.969       0.556
## 80     0.6928       0.969       0.528
## 81     0.7020       0.969       0.500
## 82     0.7410       0.969       0.472
## 83     0.8020       0.969       0.444
## 84     0.8349       0.969       0.417
## 85     0.8655       0.969       0.389
## 86     0.8970       0.984       0.389
## 87     0.9105       0.984       0.361
## 88     0.9286       0.984       0.333
## 89     0.9374       0.984       0.306
## 90     0.9481       0.984       0.278
## 91     0.9619       0.984       0.250
## 92     0.9715       1.000       0.250
## 93     0.9797       1.000       0.222
## 94     0.9829       1.000       0.194
## 95     0.9839       1.000       0.167
## 96     0.9854       1.000       0.139
## 97     0.9869       1.000       0.111
## 98     0.9900       1.000       0.083
## 99     0.9950       1.000       0.056
## 100    0.9981       1.000       0.028
## 101       Inf       1.000       0.000