機械学習

options(digits = 2) 
d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/car_data_jp.csv')
summary(d0)

##    お客様番号       性別                年齢         年収           購入判断  
##  Min.   :   1   Length:1000        Min.   :18   Min.   : 15000   Min.   :0.0  
##  1st Qu.: 251   Class :character   1st Qu.:32   1st Qu.: 46375   1st Qu.:0.0  
##  Median : 500   Mode  :character   Median :40   Median : 72000   Median :0.0  
##  Mean   : 500                      Mean   :40   Mean   : 72689   Mean   :0.4  
##  3rd Qu.: 750                      3rd Qu.:48   3rd Qu.: 90000   3rd Qu.:1.0  
##  Max.   :1000                      Max.   :63   Max.   :152500   Max.   :1.0

head(d0)

tail(d0)

(n <- nrow(d0))

## [1] 1000

set.seed(5)


library(rsample)
d.trte <- initial_split(d0, prop = 4/5, strata = 購入判断)
d.trte

## <Training/Testing/Total>
## <799/201/1000>

d.tr <- training(d.trte)
print(head(d.tr))

##   お客様番号 性別 年齢  年収 購入判断
## 1        385 男性   35 20000        0
## 2        681 男性   40 43500        0
## 3        353 男性   49 74000        0
## 4        661 男性   25 79000        0
## 5        588 男性   42 64000        0
## 6         85 女性   30 84500        0

fit <- glm(購入判断 ~ 年齢 + 年収, data = d.tr, family = 'binomial')
summary(fit)

## 
## Call:
## glm(formula = 購入判断 ~ 年齢 + 年収, family = "binomial", 
##     data = d.tr)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.20e+01   8.74e-01  -13.78   <2e-16 ***
## 年齢         2.20e-01   1.69e-02   12.99   <2e-16 ***
## 年収         3.40e-05   3.67e-06    9.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1076.60  on 798  degrees of freedom
## Residual deviance:  586.09  on 796  degrees of freedom
## AIC: 592.1
## 
## Number of Fisher Scoring iterations: 6

d.new <- data.frame(年齢 = 45, 年収 = 80000)
p.hat <- predict(fit, type = 'response', newdata = d.new) 

sprintf('新車購入確率：%2.1f％', p.hat * 100)

## [1] "新車購入確率：63.5％"

d.te <- testing(d.trte)

print(head(d.te))

##   お客様番号 性別 年齢   年収 購入判断
## 1        895 男性   40 107500        1
## 2        219 女性   46 132500        1
## 3        790 女性   32  72500        0
## 4        134 女性   49  39000        1
## 5        294 女性   41  61500        0
## 6        597 女性   31 117500        0

p.hat <- predict(fit, type = 'response', newdata = d.te)

threshold <- 0.5 

is.pred <- p.hat > threshold
is.ref <- d.te$購入判断 == 1

table(予測値 = is.pred, 真値 = is.ref)

##        真値
## 予測値 FALSE TRUE
##   FALSE   106   24
##   TRUE     14   57

is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)

sprintf('新車購入予測精度：%2.1f％', n.ok / nrow(d.te) * 100)

## [1] "新車購入予測精度：81.1％"

library(caret)

##  要求されたパッケージ ggplot2 をロード中です

##  要求されたパッケージ lattice をロード中です

confusionMatrix(data = as.factor(is.pred), 
                reference = as.factor(is.ref))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   106   24
##      TRUE     14   57
##                                        
##                Accuracy : 0.811        
##                  95% CI : (0.75, 0.863)
##     No Information Rate : 0.597        
##     P-Value [Acc > NIR] : 7e-11        
##                                        
##                   Kappa : 0.599        
##                                        
##  Mcnemar's Test P-Value : 0.144        
##                                        
##             Sensitivity : 0.883        
##             Specificity : 0.704        
##          Pos Pred Value : 0.815        
##          Neg Pred Value : 0.803        
##              Prevalence : 0.597        
##          Detection Rate : 0.527        
##    Detection Prevalence : 0.647        
##       Balanced Accuracy : 0.794        
##                                        
##        'Positive' Class : FALSE        
##

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
##  次のパッケージを付け加えます: 'pROC'

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     cov, smooth, var

roc1 <- roc(response = d.te$購入判断, predict = p.hat,
            of = 'thresholds', thresholds = 'best', print.thres = 'best',
            percent = F, plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

coords(roc1, 'best')

c <- coords(roc1) 

library(DT)
datatable(round(c, 3))

d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')

datatable(d0)

d0$生死 <- ifelse(d0$生死 == "生死", 1, 0)

head(d0)

tail(d0)

(n <- nrow(d0))

## [1] 714

set.seed(5)


library(rsample)
d.trte <- initial_split(d0, prop = 4/5, strata = 生死)
d.trte

## <Training/Testing/Total>
## <571/143/714>

fit.all <- glm(購入判断 ~ ., data = d.tr, family = 'binomial')

summary(fit.all)

## 
## Call:
## glm(formula = 購入判断 ~ ., family = "binomial", data = d.tr)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.24e+01   9.34e-01  -13.25   <2e-16 ***
## お客様番号   1.30e-04   3.56e-04    0.37     0.71    
## 性別男性     2.49e-01   2.10e-01    1.19     0.23    
## 年齢         2.22e-01   1.72e-02   12.93   <2e-16 ***
## 年収         3.43e-05   3.68e-06    9.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1076.60  on 798  degrees of freedom
## Residual deviance:  584.57  on 794  degrees of freedom
## AIC: 594.6
## 
## Number of Fisher Scoring iterations: 6

機械学習

演習課題

23150005

2024-09-17