機械学習

options(digits = 2) 
d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')
d0 <- na.omit(d0)
d0$生死 <- ifelse(d0$生死=='生存',1,0)
library(DT)
datatable(d0)

summary(d0)

##    乗船番号              生死        客室等級             名前          
##  Length:714         Min.   :0.00   Length:714         Length:714        
##  Class :character   1st Qu.:0.00   Class :character   Class :character  
##  Mode  :character   Median :0.00   Mode  :character   Mode  :character  
##                     Mean   :0.41                                        
##                     3rd Qu.:1.00                                        
##                     Max.   :1.00                                        
##      性別                年齢     兄弟配偶者数     親子数         運賃    
##  Length:714         Min.   : 0   Min.   :0.0   Min.   :0.0   Min.   :  0  
##  Class :character   1st Qu.:20   1st Qu.:0.0   1st Qu.:0.0   1st Qu.:  8  
##  Mode  :character   Median :28   Median :0.0   Median :0.0   Median : 16  
##                     Mean   :30   Mean   :0.5   Mean   :0.4   Mean   : 35  
##                     3rd Qu.:38   3rd Qu.:1.0   3rd Qu.:1.0   3rd Qu.: 33  
##                     Max.   :80   Max.   :5.0   Max.   :6.0   Max.   :512  
##     乗船地         
##  Length:714        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

head(d0)

##   乗船番号 生死 客室等級      名前 性別 年齢 兄弟配偶者数 親子数 運賃 乗船地
## 1     No.1    0      3等    Braund 男性   22            1      0  7.2      S
## 2     No.2    1      1等   Cumings 女性   38            1      0 71.3      C
## 3     No.3    1      3等 Heikkinen 女性   26            0      0  7.9      S
## 4     No.4    1      1等  Futrelle 女性   35            1      0 53.1      S
## 5     No.5    0      3等     Allen 男性   35            0      0  8.1      S
## 6     No.7    0      1等  McCarthy 男性   54            0      0 51.9      S

tail(d0)

##     乗船番号 生死 客室等級     名前 性別 年齢 兄弟配偶者数 親子数 運賃 乗船地
## 709   No.885    0      3等 Sutehall 男性   25            0      0  7.0      S
## 710   No.886    0      3等     Rice 女性   39            0      5 29.1      Q
## 711   No.887    0      2等 Montvila 男性   27            0      0 13.0      S
## 712   No.888    1      1等   Graham 女性   19            0      0 30.0      S
## 713   No.890    1      1等     Behr 男性   26            0      0 30.0      C
## 714   No.891    0      3等   Dooley 男性   32            0      0  7.8      Q

(n <- nrow(d0))

## [1] 714

set.seed(5)
library(rsample)
d.trte <- initial_split(d0, prop = 4/5, strata = 生死)
d.trte

## <Training/Testing/Total>
## <571/143/714>

d.tr <- training(d.trte) 
d.te <- testing (d.trte)

fit.all <- glm(生死 ~ ., data = d.tr, family = 'binomial')

## Warning: glm.fit: アルゴリズムは収束しませんでした

fit <- glm(生死 ~ 年齢 + 客室等級 + 性別, data = d.tr, family = 'binomial')
summary(fit)

## 
## Call:
## glm(formula = 生死 ~ 年齢 + 客室等級 + 性別, family = "binomial", 
##     data = d.tr)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   3.7215     0.4499    8.27  < 2e-16 ***
## 年齢         -0.0349     0.0085   -4.11    4e-05 ***
## 客室等級2等  -1.1619     0.3163   -3.67  0.00024 ***
## 客室等級3等  -2.5386     0.3164   -8.02    1e-15 ***
## 性別男性     -2.6201     0.2356  -11.12  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 771.4  on 570  degrees of freedom
## Residual deviance: 508.5  on 566  degrees of freedom
## AIC: 518.5
## 
## Number of Fisher Scoring iterations: 5

d.new <- data.frame(年齢 = 45, 客室等級 = '2等',性別='女性')
p.hat <- predict(fit, type = 'response', newdata = d.new) 

sprintf('生存確率：%2.1f％', p.hat * 100)

## [1] "生存確率：72.9％"

p.hat <- predict(fit, type = 'response', newdata = d.te)

threshold <- 0.5 

is.pred <- p.hat > threshold
is.ref <- d.te$生死 == 1

table(予測値 = is.pred, 真値 = is.ref)

##        真値
## 予測値 FALSE TRUE
##   FALSE    70   17
##   TRUE     15   41

library(caret)

##  要求されたパッケージ ggplot2 をロード中です

##  要求されたパッケージ lattice をロード中です

confusionMatrix(data = as.factor(is.pred), 
                reference = as.factor(is.ref))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    70   17
##      TRUE     15   41
##                                         
##                Accuracy : 0.776         
##                  95% CI : (0.699, 0.842)
##     No Information Rate : 0.594         
##     P-Value [Acc > NIR] : 3.38e-06      
##                                         
##                   Kappa : 0.533         
##                                         
##  Mcnemar's Test P-Value : 0.86          
##                                         
##             Sensitivity : 0.824         
##             Specificity : 0.707         
##          Pos Pred Value : 0.805         
##          Neg Pred Value : 0.732         
##              Prevalence : 0.594         
##          Detection Rate : 0.490         
##    Detection Prevalence : 0.608         
##       Balanced Accuracy : 0.765         
##                                         
##        'Positive' Class : FALSE         
##

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
##  次のパッケージを付け加えます: 'pROC'

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     cov, smooth, var

roc1 <- roc(response = d.te$生死, predict = p.hat,
            of = 'thresholds', thresholds = 'best', print.thres = 'best',
            percent = F, plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

coords(roc1, 'best')

##   threshold specificity sensitivity
## 1      0.45        0.81        0.78

thresholdbest <- coords(roc1, 'best', ret = 'threshold')
thresholdbest

##   threshold
## 1      0.45

c <- coords(roc1) 
library(DT)
datatable(round(c, 3))

#運賃は生死に関係ありません #2等客室にいる45才の女性の生存確率は72.9%

機械学習

23150161

2024-09-17