【回到課程大綱】

決策樹(Decision Tree)

# Load the Titanic data from an .rdata file. Adjust the path to wherever the
# file was downloaded; in this example it sits in the root of the C: drive.
# NOTE(review): the file is presumably what defines `titanic.raw` -- confirm.
load("C:/titanic.raw.rdata")
# library() stops with an error if rpart is missing; require() would only
# return FALSE and let the script fail later at the first rpart() call.
library(rpart)

# Split the data into train = 0.8 and test = 0.2.
set.seed(22)  # fixed seed so the random split is reproducible
train.index <- sample(
  x = seq_len(nrow(titanic.raw)),  # seq_len() is safe even for zero rows
  size = ceiling(0.8 * nrow(titanic.raw))
)
train <- titanic.raw[train.index, ]
test <- titanic.raw[-train.index, ]

# CART model: the survival indicator (Survived) is the response Y and all
# remaining columns of the training set are the predictors X.
cart.model <- rpart(formula = Survived ~ ., data = train)

# Print the details of every node (shown in the console window).
cart.model
## n= 1761
##
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
##
##  1) root 1761 558 No (0.68313458 0.31686542)
##    2) Sex=Male 1398 288 No (0.79399142 0.20600858)
##      4) Age=Adult 1348 264 No (0.80415430 0.19584570) *
##      5) Age=Child 50  24 No (0.52000000 0.48000000)
##       10) Class=3rd 37  11 No (0.70270270 0.29729730) *
##       11) Class=1st,2nd 13   0 Yes (0.00000000 1.00000000) *
##    3) Sex=Female 363  93 Yes (0.25619835 0.74380165)
##      6) Class=3rd 155  73 No (0.52903226 0.47096774) *
##      7) Class=1st,2nd,Crew 208  11 Yes (0.05288462 0.94711538) *

# library() fails loudly if rpart.plot is not installed; require() would only
# return FALSE and the prp() call below would then not be found.
library(rpart.plot)
prp(
  cart.model,            # the fitted tree to draw
  faclen = 0,            # do not abbreviate factor level names in the splits
  fallen.leaves = TRUE,  # place all leaf nodes at the bottom of the plot
  # extra = 2 labels each node with:
  # number of correct classifications / number of observations in that node
  extra = 2
)

(最下面節點的數字，代表：number of correct classifications / number of observations in that node)

即使是女性，可是擁有的艙位若是最低下的(3rd)，則大概有一半的死亡機率(82/155=53%)；

當你是男性成人時，大概有八成機率會死(1084/1348=80%)  

若是男性小孩，就和艙位等級有關：高級艙位的小孩全都獲救(13/13)，可是低艙位的小孩有七成機率(26/37=70%)會死。  

(男生好可憐)

●也可用另一個繪圖套件partykit，函式是as.party()與plot()

# library() errors immediately if partykit is missing; require() would only
# return FALSE and as.party() would then not be found.
library(partykit)
# Convert the rpart (CART) tree into a partykit "party" object.
rparty.tree <- as.party(cart.model)
# Print the details of every node.
rparty.tree
##
## Model formula:
## Survived ~ Class + Sex + Age
##
## Fitted party:
## [1] root
## |   [2] Sex in Male
## |   |   [3] Age in Adult: No (n = 1348, err = 19.6%)
## |   |   [4] Age in Child
## |   |   |   [5] Class in 3rd: No (n = 37, err = 29.7%)
## |   |   |   [6] Class in 1st, 2nd: Yes (n = 13, err = 0.0%)
## |   [7] Sex in Female
## |   |   [8] Class in 3rd: No (n = 155, err = 47.1%)
## |   |   [9] Class in 1st, 2nd, Crew: Yes (n = 208, err = 5.3%)
##
## Number of inner nodes:    4
## Number of terminal nodes: 5
# Draw the converted tree.
plot(rparty.tree)

# Predict the class of every row in the test set with the unpruned tree.
pred <- predict(cart.model, newdata = test, type = "class")

# Cross-tabulate actual vs. predicted classes. The table must be assigned to
# confus.matrix because the accuracy computation below reads that object.
confus.matrix <- table(real = test$Survived, predict = pred)
confus.matrix
##       predict
## real    No Yes
##   No  278   9
##   Yes  93  60

# Prediction accuracy = sum of the diagonal / total count.
sum(diag(confus.matrix)) / sum(confus.matrix)
## [1] 0.7681818

# Inspect the unpruned tree first; the CP column holds the tree's
# cost-complexity parameter.
printcp(x = cart.model)
##
## Classification tree:
## rpart(formula = Survived ~ ., data = train)
##
## Variables actually used in tree construction:
## [1] Age   Class Sex
##
## Root node error: 558/1761 = 0.31687
##
## n= 1761
##
##         CP nsplit rel error  xerror     xstd
## 1 0.317204      0   1.00000 1.00000 0.034989
## 2 0.016129      1   0.68280 0.68280 0.030966
## 3 0.011649      2   0.66667 0.68817 0.031054
## 4 0.010000      4   0.64337 0.66487 0.030668
# Plot the CP table of the unpruned tree.
plotcp(x = cart.model)

# Prune the tree at the CP value whose cross-validated error (the "xerror"
# column of the model's CP table) is the smallest.
cp.table <- cart.model$cptable
best.cp <- cp.table[which.min(cp.table[, "xerror"]), "CP"]
prunetree_cart.model <- prune(cart.model, cp = best.cp)

# Predict the class of every row in the test set with the pruned tree.
prunetree_pred <- predict(prunetree_cart.model, newdata = test, type = "class")

# Cross-tabulate actual vs. predicted classes. The table must be assigned to
# prunetree_confus.matrix because the accuracy computation below reads it.
prunetree_confus.matrix <- table(
  real = test$Survived,
  predict = prunetree_pred
)
prunetree_confus.matrix
##       predict
## real    No Yes
##   No  278   9
##   Yes  93  60

# Prediction accuracy = sum of the diagonal / total count.
# NOTE(review): the recorded result equals the unpruned tree's accuracy,
# presumably because the selected CP leaves the tree unchanged -- confirm.
sum(diag(prunetree_confus.matrix)) / sum(prunetree_confus.matrix)
## [1] 0.7681818

# library() errors immediately if a package is missing; require() would only
# return FALSE and the failure would surface later inside train().
library(caret)
library(e1071)
# Resampling scheme: 10-fold cross-validation.
train_control <- trainControl(method = "cv", number = 10)
# Fit an rpart tree through caret using the resampling scheme above.
train_control.model <- train(
  Survived ~ .,
  data = train,
  method = "rpart",
  trControl = train_control
)
# Print the resampling results.
train_control.model
## CART
##
## 1761 samples
##    3 predictor
##    2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1585, 1584, 1584, 1585, 1586, 1585, ...
## Resampling results across tuning parameters:
##
##   cp          Accuracy   Kappa
##   0.01164875  0.7796983  0.4045009
##   0.01612903  0.7774223  0.4055445
##   0.31720430  0.7097437  0.1417312
##
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.01164875.