# 決定木

# Download the Titanic passenger table (CSV with Japanese column names) and
# keep only the rows that have no missing values.
titanic_url <- 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv'
d <- read.csv(titanic_url)
d <- na.omit(d)

# Number of complete rows that remain after dropping missing values.
n <- nrow(d)

library(DT)
# Show the cleaned data as a paged table, five rows per page.
page_options <- list(pageLength = 5)
datatable(data = d, options = page_options)

# Semi-transparent plotting palette, stored as "#RRGGBBAA" strings:
#   COL[1] red, COL[2] blue, COL[3] green, COL[4] faint grey.
# Channels are given on a 0-255 scale; `alpha` is the opacity channel.
# `maxColorValue` is written out in full rather than relying on partial
# argument matching of `max`, which is fragile and flagged by linters.
COL <- c(rgb(255,   0,   0, alpha = 105, maxColorValue = 255),
         rgb(  0,   0, 255, alpha = 105, maxColorValue = 255),
         rgb(  0, 155,   0, alpha = 105, maxColorValue = 255),
         rgb(100, 100, 100, alpha =  55, maxColorValue = 255))

library(rpart)
library(rpart.plot)
# Classification tree (method = 'class') for the outcome column 生死, using
# age, sex, cabin class, fare, sibling/spouse count, parent/child count and
# port of embarkation as predictors. cp = 0.09 is the complexity threshold a
# split must reach to be kept.
tree <- rpart(
  formula = 生死 ~ 年齢 + 性別 + 客室等級 + 運賃 + 兄弟配偶者数 + 親子数 + 乗船地,
  data    = d,
  method  = 'class',
  cp      = 0.09
)

# Draw the fitted tree with node layout `type = 5`
# (see the rpart.plot documentation for the available layouts).
rpart.plot(x = tree, type = 5)

# Draw it again with the default layout but `branch.type = 5`, which, per the
# rpart.plot/prp documentation, scales branch widths by node weight.
rpart.plot(x = tree, branch.type = 5)

# Plot the cross-validation results of the fitted tree against the complexity
# parameter, to help choose a pruning threshold. Plotted once: repeating the
# call only redraws the same figure from the same fitted object.
plotcp(tree)

# Prune with cp = 0.9, ten times the cp = 0.09 used when fitting.
# NOTE(review): a threshold this large presumably removes every split and
# leaves only the root node -- confirm this over-pruning is intentional.
tree2 <- prune(tree, cp = 0.9)

rpart.plot(tree2, branch.type = 5)

# Prune with cp = 0.09, the same threshold that was used when fitting, so no
# additional splits are expected to be removed.
tree3 <- prune(tree, cp = 0.09)

rpart.plot(tree3, branch.type = 5)

# Predict class probabilities for one hypothetical passenger, supplied inline
# as a one-row data frame whose columns match the model's predictors.
# rules = TRUE adds the decision rule that produced the prediction to the
# output. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
rpart.predict(tree, rules = TRUE,
              newdata = data.frame(年齢         = 5,
                                   性別         = '女性', 
                                   客室等級     = '3等', 
                                   運賃         = 10,
                                   兄弟配偶者数 = 5,
                                   親子数       = 7,
                                   乗船地       = 'S'))

##        死亡      生存                     
## 1 0.2452107 0.7547893 because 性別 is 女性

# Same hypothetical passenger as a named one-row data frame, so that it can be
# passed to the prediction call (and reused) by name.
d.new <- data.frame(年齢         = 5,
                    性別         = '女性', 
                    客室等級     = '3等', 
                    運賃         = 10,
                    兄弟配偶者数 = 5,
                    親子数       = 7,
                    乗船地       = 'S')

# Predict class probabilities for that passenger; rules = TRUE adds the
# decision rule behind the prediction. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
rpart.predict(tree, rules = TRUE, newdata = d.new)

##        死亡      生存                     
## 1 0.2452107 0.7547893 because 性別 is 女性

# Corner coordinates of a 2 x 2 square of points: (1,1), (2,1), (1,2), (2,2).
xs <- rep(c(1, 2), times = 2)
ys <- rep(c(1, 2), each = 2)

# Toy data set: four copies of that square, each shifted to a different
# position and labelled z = 1..4 (four points per label).
#   label 1: no shift        label 2: y + 5
#   label 3: x + 5, y + 3    label 4: x + 5, y + 9
d0 <- data.frame(
  x = rep(xs, times = 4) + rep(c(0, 0, 5, 5), each = 4),
  y = rep(ys, times = 4) + rep(c(0, 5, 3, 9), each = 4),
  z = rep(c(1, 2, 3, 4), each = 4)
)

# Scatter plot of the toy points as large open circles in the first palette
# colour, with each point's z label written on top of it.
matplot(
  x   = d0$x,
  y   = d0$y,
  col = COL[1],
  pch = 1,
  cex = 2
)
text(x = d0$x, y = d0$y, labels = d0$z)

# Fit a tree for z from the two coordinates. minsplit = 2 and minbucket = 1
# relax the node-size limits so that this 16-row data set can be split all
# the way down. No `method` is given, so rpart chooses it from the type of z.
# Note: this overwrites the `tree` object fitted earlier in the script.
tree <- rpart(
  formula   = z ~ x + y,
  data      = d0,
  minsplit  = 2,
  minbucket = 1
)
rpart.plot(x = tree)

# Redraw the toy scatter plot and overlay dotted guide lines in the second
# palette colour that separate the four groups of points.
matplot(
  x   = d0$x,
  y   = d0$y,
  col = COL[1],
  pch = 1,
  cex = 2
)

# One vectorised call draws all three guides:
#   vertical   x = 4 from y = 0 to y = 12
#   horizontal y = 8 from x = 4 to x = 8
#   horizontal y = 4 from x = 0 to x = 4
segments(
  x0  = c(4, 4, 0),
  y0  = c(0, 8, 4),
  x1  = c(4, 8, 4),
  y1  = c(12, 8, 4),
  lty = 3,
  col = COL[2]
)
text(x = d0$x, y = d0$y, labels = d0$z)

# Complexity-parameter plot for the tree fitted on the toy data.
plotcp(x = tree)

# Tree with age (年齢) as the response and the remaining passenger attributes
# as predictors, fitted with rpart's default settings.
# Note: this overwrites the previous `tree` object.
tree <- rpart(
  formula = 年齢 ~ 性別 + 客室等級 + 運賃 + 兄弟配偶者数 + 親子数 + 乗船地,
  data    = d
)

# Plot the tree with node layout `type = 5`.
rpart.plot(x = tree, type = 5)

# Plot it again with the default layout and `branch.type = 5` branch widths.
rpart.plot(x = tree, branch.type = 5)

# 決定木
#
# 大石拓真
#
# 2024-10-08