d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')
library(DT)
datatable(d0)
summary(d0)
## 乗船番号 生死 客室等級 名前
## Length:714 Length:714 Length:714 Length:714
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## 性別 年齢 兄弟配偶者数 親子数
## Length:714 Min. : 0.42 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.0000 Median :0.0000
## Mean :29.70 Mean :0.5126 Mean :0.4314
## 3rd Qu.:38.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :80.00 Max. :5.0000 Max. :6.0000
## 運賃 乗船地
## Min. : 0.00 Length:714
## 1st Qu.: 8.05 Class :character
## Median : 15.74 Mode :character
## Mean : 34.69
## 3rd Qu.: 33.38
## Max. :512.33
head(d0)
tail(d0)
(n <- nrow(d0))
## [1] 714
d <- d0[,-1][,-3]
d$生死 <- ifelse(d$生死 == '生存', 1, 0)
datatable(d)
summary(d)
## 生死 客室等級 性別 年齢
## Min. :0.0000 Length:714 Length:714 Min. : 0.42
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:20.12
## Median :0.0000 Mode :character Mode :character Median :28.00
## Mean :0.4062 Mean :29.70
## 3rd Qu.:1.0000 3rd Qu.:38.00
## Max. :1.0000 Max. :80.00
## 兄弟配偶者数 親子数 運賃 乗船地
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Length:714
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 8.05 Class :character
## Median :0.0000 Median :0.0000 Median : 15.74 Mode :character
## Mean :0.5126 Mean :0.4314 Mean : 34.69
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 33.38
## Max. :5.0000 Max. :6.0000 Max. :512.33
d$性別 <- ifelse(d$性別 == '男性', 1, 0)
d$客室等級 <- ifelse(d$客室等級 == '1等', 1,ifelse(d$客室等級=='2等', 2, 3))
d1 <- na.omit(d)
datatable(d1)
summary(d1)
## 生死 客室等級 性別 年齢
## Min. :0.0000 Min. :1.000 Min. :0.0000 Min. : 0.42
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:20.12
## Median :0.0000 Median :2.000 Median :1.0000 Median :28.00
## Mean :0.4062 Mean :2.237 Mean :0.6345 Mean :29.70
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:38.00
## Max. :1.0000 Max. :3.000 Max. :1.0000 Max. :80.00
## 兄弟配偶者数 親子数 運賃 乗船地
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Length:714
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 8.05 Class :character
## Median :0.0000 Median :0.0000 Median : 15.74 Mode :character
## Mean :0.5126 Mean :0.4314 Mean : 34.69
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 33.38
## Max. :5.0000 Max. :6.0000 Max. :512.33
set.seed(5)
library(rsample)
d.trte <- initial_split(d1, prop = 4/5, strata = 生死)
d.trte
## <Training/Testing/Total>
## <571/143/714>
d.tr <- training(d.trte)
d.te <- testing (d.trte)
nrow(d.tr)
## [1] 571
nrow(d.te)
## [1] 143
fit.all <- glm(生死 ~ ., data = d.tr, family = 'binomial')
summary(fit.all)
##
## Call:
## glm(formula = 生死 ~ ., family = "binomial", data = d.tr)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.787e+01 6.095e+02 0.029 0.9766
## 客室等級 -1.232e+00 1.823e-01 -6.755 1.43e-11 ***
## 性別 -2.728e+00 2.519e-01 -10.828 < 2e-16 ***
## 年齢 -4.098e-02 9.057e-03 -4.524 6.06e-06 ***
## 兄弟配偶者数 -3.258e-01 1.408e-01 -2.313 0.0207 *
## 親子数 -6.688e-02 1.366e-01 -0.490 0.6244
## 運賃 6.083e-04 2.478e-03 0.245 0.8061
## 乗船地C -1.237e+01 6.095e+02 -0.020 0.9838
## 乗船地Q -1.283e+01 6.095e+02 -0.021 0.9832
## 乗船地S -1.252e+01 6.095e+02 -0.021 0.9836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 771.40 on 570 degrees of freedom
## Residual deviance: 499.78 on 561 degrees of freedom
## AIC: 519.78
##
## Number of Fisher Scoring iterations: 13
運賃は生死を分ける要素ではない
fit <- glm(生死 ~ 客室等級 + 性別 + 年齢, data = d.tr, family = 'binomial')
summary(fit)
##
## Call:
## glm(formula = 生死 ~ 客室等級 + 性別 + 年齢, family = "binomial",
## data = d.tr)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.053870 0.564456 8.954 < 2e-16 ***
## 客室等級 -1.279224 0.156612 -8.168 3.13e-16 ***
## 性別 -2.622552 0.235270 -11.147 < 2e-16 ***
## 年齢 -0.035297 0.008464 -4.170 3.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 771.40 on 570 degrees of freedom
## Residual deviance: 508.68 on 567 degrees of freedom
## AIC: 516.68
##
## Number of Fisher Scoring iterations: 5
fit1 <- glm(生死 ~ 客室等級 + 性別 + 年齢 + 兄弟配偶者数, data = d.tr, family = 'binomial')
summary(fit1)
##
## Call:
## glm(formula = 生死 ~ 客室等級 + 性別 + 年齢 + 兄弟配偶者数,
## family = "binomial", data = d.tr)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.501858 0.599405 9.179 < 2e-16 ***
## 客室等級 -1.293050 0.157420 -8.214 < 2e-16 ***
## 性別 -2.707763 0.242207 -11.180 < 2e-16 ***
## 年齢 -0.041492 0.008946 -4.638 3.51e-06 ***
## 兄弟配偶者数 -0.346223 0.131839 -2.626 0.00864 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 771.40 on 570 degrees of freedom
## Residual deviance: 501.13 on 566 degrees of freedom
## AIC: 511.13
##
## Number of Fisher Scoring iterations: 5
2等客室にいる45才の女性
d.new <- data.frame(年齢 = 45, 客室等級 = 2,性別 = 0)
p.hat <- predict(fit, type = 'response', newdata = d.new)
sprintf('生存確率:%2.1f%', p.hat * 100)
## [1] "生存確率:71.2%"
p.hat <- predict(fit1, type = 'response', newdata = d.te)
threshold <- 0.5 # 閾値(しきいち),カットオフ値(cut-off)とも呼ばれる。
is.pred <- p.hat > threshold
is.ref <- d.te$生死 == 1
table(予測値 = is.pred, 真値 = is.ref)
## 真値
## 予測値 FALSE TRUE
## FALSE 71 15
## TRUE 14 43
is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)
sprintf('生死予測精度:%2.1f%', n.ok / nrow(d.te) * 100)
## [1] "生死予測精度:79.7%"
library(caret)
confusionMatrix(data = as.factor(is.pred),
reference = as.factor(is.ref))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 71 15
## TRUE 14 43
##
## Accuracy : 0.7972
## 95% CI : (0.7219, 0.8598)
## No Information Rate : 0.5944
## P-Value [Acc > NIR] : 2.035e-07
##
## Kappa : 0.5783
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8353
## Specificity : 0.7414
## Pos Pred Value : 0.8256
## Neg Pred Value : 0.7544
## Prevalence : 0.5944
## Detection Rate : 0.4965
## Detection Prevalence : 0.6014
## Balanced Accuracy : 0.7883
##
## 'Positive' Class : FALSE
##
library(pROC)
roc1 <- roc(response = d.te$生死, predict = p.hat,
of = 'thresholds', thresholds = 'best', print.thres = 'best',
percent = F, plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

coords(roc1, 'best')
thresholdbest <- coords(roc1, 'best', ret = 'threshold')
thresholdbest
c <- coords(roc1)
library(DT)
datatable(round(c, 3))