ロジスティック回帰２.knit

options(digits = 2) 
d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')
library(DT)
datatable(d0)

d0$生死 <-ifelse(d0$生死　== '生存',1,0)
d0$性別 <-ifelse(d0$性別　== '性別',1,0)
head(d0)

##   乗船番号 生死 客室等級      名前 性別 年齢 兄弟配偶者数 親子数 運賃 乗船地
## 1     No.1    0      3等    Braund    0   22            1      0  7.2      S
## 2     No.2    1      1等   Cumings    0   38            1      0 71.3      C
## 3     No.3    1      3等 Heikkinen    0   26            0      0  7.9      S
## 4     No.4    1      1等  Futrelle    0   35            1      0 53.1      S
## 5     No.5    0      3等     Allen    0   35            0      0  8.1      S
## 6     No.7    0      1等  McCarthy    0   54            0      0 51.9      S

tail(d0)

##     乗船番号 生死 客室等級     名前 性別 年齢 兄弟配偶者数 親子数 運賃 乗船地
## 709   No.885    0      3等 Sutehall    0   25            0      0  7.0      S
## 710   No.886    0      3等     Rice    0   39            0      5 29.1      Q
## 711   No.887    0      2等 Montvila    0   27            0      0 13.0      S
## 712   No.888    1      1等   Graham    0   19            0      0 30.0      S
## 713   No.890    1      1等     Behr    0   26            0      0 30.0      C
## 714   No.891    0      3等   Dooley    0   32            0      0  7.8      Q

(n <- nrow(d0))

## [1] 714

set.seed(5)

library(rsample)
d.trte <- initial_split(d0, prop = 4/5, strata = 生死)
d.trte

## <Training/Testing/Total>
## <571/143/714>

d.tr <- training(d.trte) # 訓練データ
d.te <- testing (d.trte) # 試験データ
fit.all <- glm( 生死~ 運賃, data = d.tr, family = 'binomial')
summary(fit.all)

## 
## Call:
## glm(formula = 生死 ~ 運賃, family = "binomial", data = d.tr)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.83270    0.11700   -7.12  1.1e-12 ***
## 運賃         0.01370    0.00259    5.29  1.2e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 771.40  on 570  degrees of freedom
## Residual deviance: 727.74  on 569  degrees of freedom
## AIC: 731.7
## 
## Number of Fisher Scoring iterations: 5

fit <- glm(生死 ~ 客室等級 + 年齢 , data = d.tr, family = 'binomial')

summary(fit)

## 
## Call:
## glm(formula = 生死 ~ 客室等級 + 年齢, family = "binomial", 
##     data = d.tr)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.19078    0.34773    6.30  3.0e-10 ***
## 客室等級2等 -1.00569    0.26612   -3.78  0.00016 ***
## 客室等級3等 -2.42101    0.26493   -9.14  < 2e-16 ***
## 年齢        -0.04003    0.00736   -5.44  5.4e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 771.40  on 570  degrees of freedom
## Residual deviance: 661.48  on 567  degrees of freedom
## AIC: 669.5
## 
## Number of Fisher Scoring iterations: 4

d.new <- data.frame(年齢 = 45, 性別 = 1,客室等級 = '2等')
p.hat <- predict(fit, type = 'response', newdata = d.new) 

sprintf('生存確率：%2.1f％', p.hat * 100)

## [1] "生存確率：35.1％"

p.hat <- predict(fit, type = 'response', newdata = d.te)

threshold <- 0.5 

is.pred <- p.hat > threshold
is.ref <- d.te$生死 == 1

table(予測値 = is.pred, 真値 = is.ref)

##        真値
## 予測値 FALSE TRUE
##   FALSE    67   26
##   TRUE     18   32

is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)

sprintf('生死予測精度：%2.1f％', n.ok / nrow(d.te) * 100)

## [1] "生死予測精度：69.2％"

library(caret)

##  要求されたパッケージ ggplot2 をロード中です

##  要求されたパッケージ lattice をロード中です

confusionMatrix(data = as.factor(is.pred), 
                reference = as.factor(is.ref))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    67   26
##      TRUE     18   32
##                                        
##                Accuracy : 0.692        
##                  95% CI : (0.61, 0.767)
##     No Information Rate : 0.594        
##     P-Value [Acc > NIR] : 0.00992      
##                                        
##                   Kappa : 0.348        
##                                        
##  Mcnemar's Test P-Value : 0.29129      
##                                        
##             Sensitivity : 0.788        
##             Specificity : 0.552        
##          Pos Pred Value : 0.720        
##          Neg Pred Value : 0.640        
##              Prevalence : 0.594        
##          Detection Rate : 0.469        
##    Detection Prevalence : 0.650        
##       Balanced Accuracy : 0.670        
##                                        
##        'Positive' Class : FALSE        
##

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
##  次のパッケージを付け加えます: 'pROC'

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     cov, smooth, var

roc1 <- roc(response = d.te$生死, predict = p.hat,
            of = 'thresholds', thresholds = 'best', print.thres = 'best',
            percent = F, plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

coords(roc1, 'best')

##   threshold specificity sensitivity
## 1      0.44        0.71        0.66

c <- coords(roc1) 

library(DT)
datatable(round(c, 3))