次のデータを用いて新車購入の確率を予測する。
User.ID:お客様番号
Gender:性別(Male:男性,Female:女性)
Age: 年齢
AnnualSalary:年収(US$)
Purchased:新車購入の有無(1:購入,0:未購入)
options(digits = 2)
d <- na.omit(read.csv(file = "car_data.csv"))
(n <- nrow(d))
## [1] 1000
set.seed(5)
ii <- sample(x = 1:n, size = 900)
d.train <- d[ii, ]
d.test <- d[-ii, ]
head(d)
## User.ID Gender Age AnnualSalary Purchased
## 1 385 Male 35 20000 0
## 2 681 Male 40 43500 0
## 3 353 Male 49 74000 0
## 4 895 Male 40 107500 1
## 5 661 Male 25 79000 0
## 6 846 Female 47 33500 1
summary(d)
## User.ID Gender Age AnnualSalary Purchased
## Min. : 1 Length:1000 Min. :18 Min. : 15000 Min. :0.0
## 1st Qu.: 251 Class :character 1st Qu.:32 1st Qu.: 46375 1st Qu.:0.0
## Median : 500 Mode :character Median :40 Median : 72000 Median :0.0
## Mean : 500 Mean :40 Mean : 72689 Mean :0.4
## 3rd Qu.: 750 3rd Qu.:48 3rd Qu.: 90000 3rd Qu.:1.0
## Max. :1000 Max. :63 Max. :152500 Max. :1.0
\[ \begin{align} \hat{p}&=\frac{1}{1+e^{-(\beta_0+\beta_1 x_{i}+\cdots+\beta_k x_{k})}}\\ &ここで,\\ &\quad \hat{p}:確率の推定値\\ &\quad x_{k}: 説明変数\\ &\quad \beta_k: 偏回帰係数\\ \end{align} \]
fit0 <- glm(Purchased ~ ., data = d, family = "binomial")
summary(fit0)
##
## Call:
## glm(formula = Purchased ~ ., family = "binomial", data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.927 -0.593 -0.134 0.480 2.470
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.22e+01 8.21e-01 -14.88 <2e-16 ***
## User.ID 7.56e-05 3.16e-04 0.24 0.811
## GenderMale 3.20e-01 1.86e-01 1.72 0.085 .
## Age 2.19e-01 1.52e-02 14.47 <2e-16 ***
## AnnualSalary 3.37e-05 3.23e-06 10.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1347.6 on 999 degrees of freedom
## Residual deviance: 742.9 on 995 degrees of freedom
## AIC: 752.9
##
## Number of Fisher Scoring iterations: 6
fit <- glm(Purchased ~ Gender + Age + AnnualSalary,
data = d, family = "binomial")
summary(fit)
##
## Call:
## glm(formula = Purchased ~ Gender + Age + AnnualSalary, family = "binomial",
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.928 -0.595 -0.135 0.478 2.476
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.22e+01 8.05e-01 -15.13 <2e-16 ***
## GenderMale 3.18e-01 1.86e-01 1.72 0.086 .
## Age 2.19e-01 1.52e-02 14.47 <2e-16 ***
## AnnualSalary 3.37e-05 3.23e-06 10.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1347.63 on 999 degrees of freedom
## Residual deviance: 742.96 on 996 degrees of freedom
## AIC: 751
##
## Number of Fisher Scoring iterations: 6
年収80,000US$の45歳男性の新車購入確率を求める。
dp <- data.frame(Gender = "Male", Age = 45, AnnualSalary = 80000)
phat <- predict(fit, type = "response", newdata = dp)
sprintf("新車購入確率:%2.1f%", phat * 100)
## [1] "新車購入確率:67.1%"
phat <- predict(fit, type = "response", newdata = d.test)
# 混同行列(confusion matrix)
threshold <- 0.5 # 閾値(しきいち),カットオフ値(cut-off)とも呼ばれる。
is.pred <- phat > threshold
is.ref <- d.test$Purchased == 1
table(Prediction = is.pred, Reference = is.ref)
## Reference
## Prediction FALSE TRUE
## FALSE 58 8
## TRUE 6 28
is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)
sprintf("新車購入予測精度:%2.1f%", n.ok / nrow(d.test) * 100)
## [1] "新車購入予測精度:86.0%"
# 混同行列 (confusion matrix) caretパッケージを利用した詳細分析
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
confusionMatrix(data = as.factor(is.pred),
reference = as.factor(is.ref))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 58 8
## TRUE 6 28
##
## Accuracy : 0.86
## 95% CI : (0.776, 0.921)
## No Information Rate : 0.64
## P-Value [Acc > NIR] : 8.06e-07
##
## Kappa : 0.692
##
## Mcnemar's Test P-Value : 0.789
##
## Sensitivity : 0.906
## Specificity : 0.778
## Pos Pred Value : 0.879
## Neg Pred Value : 0.824
## Prevalence : 0.640
## Detection Rate : 0.580
## Detection Prevalence : 0.660
## Balanced Accuracy : 0.842
##
## 'Positive' Class : FALSE
##
ROC (Receiver Operating Characteristic):受信者動作特性
AUC (Area Under Curve) :ROC曲線の下側の面積
Sensitivity (true positive rate) :真陽性率(感度)
Specificity (true negative rate) :真陰性率(特異度)
library(pROC)
roc1 <- roc(response = d.test$Purchased, predict = phat,
plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)
coords(roc1, "best") # 最適閾値 (optimal threshold)
## threshold specificity sensitivity
## 1 0.38 0.83 0.92
coords(roc1) # 閾値 (threshold)ごとの値 (sensitivity, specificity)
## threshold specificity sensitivity
## 1 -Inf 0.000 1.000
## 2 0.0030 0.016 1.000
## 3 0.0040 0.031 1.000
## 4 0.0044 0.047 1.000
## 5 0.0053 0.062 1.000
## 6 0.0071 0.078 1.000
## 7 0.0084 0.094 1.000
## 8 0.0108 0.109 1.000
## 9 0.0137 0.125 1.000
## 10 0.0154 0.141 1.000
## 11 0.0200 0.156 1.000
## 12 0.0244 0.172 1.000
## 13 0.0254 0.188 1.000
## 14 0.0264 0.203 1.000
## 15 0.0283 0.219 1.000
## 16 0.0301 0.234 1.000
## 17 0.0345 0.250 1.000
## 18 0.0385 0.266 1.000
## 19 0.0400 0.281 1.000
## 20 0.0426 0.297 1.000
## 21 0.0447 0.312 1.000
## 22 0.0459 0.328 1.000
## 23 0.0482 0.344 1.000
## 24 0.0499 0.359 1.000
## 25 0.0551 0.375 1.000
## 26 0.0604 0.391 1.000
## 27 0.0634 0.406 1.000
## 28 0.0690 0.422 1.000
## 29 0.0811 0.438 1.000
## 30 0.0906 0.453 1.000
## 31 0.0913 0.469 1.000
## 32 0.0956 0.484 1.000
## 33 0.1016 0.500 1.000
## 34 0.1113 0.516 1.000
## 35 0.1214 0.531 1.000
## 36 0.1281 0.547 1.000
## 37 0.1442 0.562 1.000
## 38 0.1566 0.578 1.000
## 39 0.1721 0.594 1.000
## 40 0.1958 0.594 0.972
## 41 0.2058 0.609 0.972
## 42 0.2085 0.625 0.972
## 43 0.2165 0.641 0.972
## 44 0.2236 0.656 0.972
## 45 0.2265 0.672 0.972
## 46 0.2293 0.688 0.972
## 47 0.2310 0.703 0.972
## 48 0.2328 0.719 0.972
## 49 0.2360 0.734 0.972
## 50 0.2432 0.750 0.972
## 51 0.2494 0.750 0.944
## 52 0.2693 0.766 0.944
## 53 0.2911 0.781 0.944
## 54 0.3206 0.797 0.944
## 55 0.3481 0.797 0.917
## 56 0.3656 0.812 0.917
## 57 0.3847 0.828 0.917
## 58 0.3910 0.828 0.889
## 59 0.4124 0.828 0.861
## 60 0.4331 0.844 0.861
## 61 0.4363 0.844 0.833
## 62 0.4419 0.859 0.833
## 63 0.4467 0.875 0.833
## 64 0.4533 0.891 0.833
## 65 0.4662 0.891 0.806
## 66 0.4851 0.891 0.778
## 67 0.5060 0.906 0.778
## 68 0.5212 0.906 0.750
## 69 0.5296 0.906 0.722
## 70 0.5334 0.906 0.694
## 71 0.5377 0.922 0.694
## 72 0.5412 0.922 0.667
## 73 0.5510 0.922 0.639
## 74 0.5620 0.922 0.611
## 75 0.5683 0.938 0.611
## 76 0.5800 0.953 0.611
## 77 0.6001 0.953 0.583
## 78 0.6379 0.953 0.556
## 79 0.6779 0.969 0.556
## 80 0.6928 0.969 0.528
## 81 0.7020 0.969 0.500
## 82 0.7410 0.969 0.472
## 83 0.8020 0.969 0.444
## 84 0.8349 0.969 0.417
## 85 0.8655 0.969 0.389
## 86 0.8970 0.984 0.389
## 87 0.9105 0.984 0.361
## 88 0.9286 0.984 0.333
## 89 0.9374 0.984 0.306
## 90 0.9481 0.984 0.278
## 91 0.9619 0.984 0.250
## 92 0.9715 1.000 0.250
## 93 0.9797 1.000 0.222
## 94 0.9829 1.000 0.194
## 95 0.9839 1.000 0.167
## 96 0.9854 1.000 0.139
## 97 0.9869 1.000 0.111
## 98 0.9900 1.000 0.083
## 99 0.9950 1.000 0.056
## 100 0.9981 1.000 0.028
## 101 Inf 1.000 0.000