d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')
library(DT)
datatable(d0)
summary(d0)
##    乗船番号             生死             客室等級             名前          
##  Length:714         Length:714         Length:714         Length:714        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      性別                年齢        兄弟配偶者数        親子数      
##  Length:714         Min.   : 0.42   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.0000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.5126   Mean   :0.4314  
##                     3rd Qu.:38.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                     Max.   :80.00   Max.   :5.0000   Max.   :6.0000  
##       運賃           乗船地         
##  Min.   :  0.00   Length:714        
##  1st Qu.:  8.05   Class :character  
##  Median : 15.74   Mode  :character  
##  Mean   : 34.69                     
##  3rd Qu.: 33.38                     
##  Max.   :512.33
head(d0)
tail(d0)
(n <- nrow(d0))
## [1] 714
d <- d0[,-1][,-3]
d$生死 <- ifelse(d$生死 == '生存', 1, 0)
datatable(d)
summary(d)
##       生死          客室等級             性別                年齢      
##  Min.   :0.0000   Length:714         Length:714         Min.   : 0.42  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:20.12  
##  Median :0.0000   Mode  :character   Mode  :character   Median :28.00  
##  Mean   :0.4062                                         Mean   :29.70  
##  3rd Qu.:1.0000                                         3rd Qu.:38.00  
##  Max.   :1.0000                                         Max.   :80.00  
##   兄弟配偶者数        親子数            運賃           乗船地         
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Length:714        
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  8.05   Class :character  
##  Median :0.0000   Median :0.0000   Median : 15.74   Mode  :character  
##  Mean   :0.5126   Mean   :0.4314   Mean   : 34.69                     
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 33.38                     
##  Max.   :5.0000   Max.   :6.0000   Max.   :512.33
d$性別 <- ifelse(d$性別 == '男性', 1, 0)
d$客室等級 <- ifelse(d$客室等級 == '1等', 1,ifelse(d$客室等級=='2等', 2, 3))
d1 <- na.omit(d)

datatable(d1)
summary(d1)
##       生死           客室等級          性別             年齢      
##  Min.   :0.0000   Min.   :1.000   Min.   :0.0000   Min.   : 0.42  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:20.12  
##  Median :0.0000   Median :2.000   Median :1.0000   Median :28.00  
##  Mean   :0.4062   Mean   :2.237   Mean   :0.6345   Mean   :29.70  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:38.00  
##  Max.   :1.0000   Max.   :3.000   Max.   :1.0000   Max.   :80.00  
##   兄弟配偶者数        親子数            運賃           乗船地         
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Length:714        
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  8.05   Class :character  
##  Median :0.0000   Median :0.0000   Median : 15.74   Mode  :character  
##  Mean   :0.5126   Mean   :0.4314   Mean   : 34.69                     
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 33.38                     
##  Max.   :5.0000   Max.   :6.0000   Max.   :512.33
set.seed(5)
library(rsample)
d.trte <- initial_split(d1, prop = 4/5, strata = 生死)
d.trte
## <Training/Testing/Total>
## <571/143/714>
d.tr <- training(d.trte)
d.te <- testing (d.trte)
nrow(d.tr)
## [1] 571
nrow(d.te)
## [1] 143
fit.all <- glm(生死 ~ ., data = d.tr, family = 'binomial')
summary(fit.all)
## 
## Call:
## glm(formula = 生死 ~ ., family = "binomial", data = d.tr)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.787e+01  6.095e+02   0.029   0.9766    
## 客室等級     -1.232e+00  1.823e-01  -6.755 1.43e-11 ***
## 性別         -2.728e+00  2.519e-01 -10.828  < 2e-16 ***
## 年齢         -4.098e-02  9.057e-03  -4.524 6.06e-06 ***
## 兄弟配偶者数 -3.258e-01  1.408e-01  -2.313   0.0207 *  
## 親子数       -6.688e-02  1.366e-01  -0.490   0.6244    
## 運賃          6.083e-04  2.478e-03   0.245   0.8061    
## 乗船地C      -1.237e+01  6.095e+02  -0.020   0.9838    
## 乗船地Q      -1.283e+01  6.095e+02  -0.021   0.9832    
## 乗船地S      -1.252e+01  6.095e+02  -0.021   0.9836    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 771.40  on 570  degrees of freedom
## Residual deviance: 499.78  on 561  degrees of freedom
## AIC: 519.78
## 
## Number of Fisher Scoring iterations: 13

運賃は生死を分ける要素ではない

fit <- glm(生死 ~ 客室等級 + 性別 + 年齢, data = d.tr, family = 'binomial')
summary(fit)
## 
## Call:
## glm(formula = 生死 ~ 客室等級 + 性別 + 年齢, family = "binomial", 
##     data = d.tr)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.053870   0.564456   8.954  < 2e-16 ***
## 客室等級    -1.279224   0.156612  -8.168 3.13e-16 ***
## 性別        -2.622552   0.235270 -11.147  < 2e-16 ***
## 年齢        -0.035297   0.008464  -4.170 3.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 771.40  on 570  degrees of freedom
## Residual deviance: 508.68  on 567  degrees of freedom
## AIC: 516.68
## 
## Number of Fisher Scoring iterations: 5
fit1 <- glm(生死 ~ 客室等級 + 性別 + 年齢 + 兄弟配偶者数, data = d.tr, family = 'binomial')
summary(fit1)
## 
## Call:
## glm(formula = 生死 ~ 客室等級 + 性別 + 年齢 + 兄弟配偶者数, 
##     family = "binomial", data = d.tr)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   5.501858   0.599405   9.179  < 2e-16 ***
## 客室等級     -1.293050   0.157420  -8.214  < 2e-16 ***
## 性別         -2.707763   0.242207 -11.180  < 2e-16 ***
## 年齢         -0.041492   0.008946  -4.638 3.51e-06 ***
## 兄弟配偶者数 -0.346223   0.131839  -2.626  0.00864 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 771.40  on 570  degrees of freedom
## Residual deviance: 501.13  on 566  degrees of freedom
## AIC: 511.13
## 
## Number of Fisher Scoring iterations: 5

2等客室にいる45才の女性

d.new <- data.frame(年齢 = 45, 客室等級 = 2,性別 = 0)
p.hat <- predict(fit, type = 'response', newdata = d.new)

sprintf('生存確率:%2.1f%', p.hat * 100)
## [1] "生存確率:71.2%"
p.hat <- predict(fit1, type = 'response', newdata = d.te)
threshold <- 0.5 # 閾値(しきいち),カットオフ値(cut-off)とも呼ばれる。
is.pred <- p.hat > threshold
is.ref <- d.te$生死 == 1
table(予測値 = is.pred, 真値 = is.ref)
##        真値
## 予測値 FALSE TRUE
##   FALSE    71   15
##   TRUE     14   43
is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)
sprintf('生死予測精度:%2.1f%', n.ok / nrow(d.te) * 100)
## [1] "生死予測精度:79.7%"
library(caret)
confusionMatrix(data = as.factor(is.pred), 
                reference = as.factor(is.ref))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    71   15
##      TRUE     14   43
##                                           
##                Accuracy : 0.7972          
##                  95% CI : (0.7219, 0.8598)
##     No Information Rate : 0.5944          
##     P-Value [Acc > NIR] : 2.035e-07       
##                                           
##                   Kappa : 0.5783          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8353          
##             Specificity : 0.7414          
##          Pos Pred Value : 0.8256          
##          Neg Pred Value : 0.7544          
##              Prevalence : 0.5944          
##          Detection Rate : 0.4965          
##    Detection Prevalence : 0.6014          
##       Balanced Accuracy : 0.7883          
##                                           
##        'Positive' Class : FALSE           
## 
library(pROC)
roc1 <- roc(response = d.te$生死, predict = p.hat,
            of = 'thresholds', thresholds = 'best', print.thres = 'best',
            percent = F, plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

coords(roc1, 'best')
thresholdbest <- coords(roc1, 'best', ret = 'threshold')
thresholdbest
c <- coords(roc1)
library(DT)
datatable(round(c, 3))