Chapter 16 Trees

Section 16.1 Regression Trees

資料: ozone — A study of the relationship between atmospheric ozone concentration and meteorology in the Los Angeles Basin in 1976. A number of cases with missing values have been removed for simplicity.

讀入資料

library(faraway)
# Load the ozone data set into the workspace.
data("ozone")
# ?ozone  # uncomment to open the help page describing the variables

Tree 分析套件: rpart

#install.packages("rpart")
library(rpart)
## 
## 載入套件:'rpart'
## 下列物件被遮斷自 'package:faraway':
## 
##     solder
# Fit a regression tree for O3 on all remaining columns of ozone using
# rpart's default settings, then print the fitted tree node by node.
tmod <- rpart(formula = O3 ~ ., data = ozone)
print(tmod)
## n= 330 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 330 21115.4100 11.775760  
##    2) temp< 67.5 214  4114.3040  7.425234  
##      4) ibh>=3573.5 108   689.6296  5.148148 *
##      5) ibh< 3573.5 106  2294.1230  9.745283  
##       10) dpg< -9.5 35   362.6857  6.457143 *
##       11) dpg>=-9.5 71  1366.4790 11.366200  
##         22) ibt< 159 40   287.9000  9.050000 *
##         23) ibt>=159 31   587.0968 14.354840 *
##    3) temp>=67.5 116  5478.4400 19.801720  
##      6) ibt< 226.5 55  1276.8360 15.945450  
##       12) humidity< 59.5 10   167.6000 10.800000 *
##       13) humidity>=59.5 45   785.6444 17.088890 *
##      7) ibt>=226.5 61  2646.2620 23.278690  
##       14) doy>=306.5 8   398.0000 16.000000 *
##       15) doy< 306.5 53  1760.4530 24.377360  
##         30) vis>=55 36  1149.8890 22.944440 *
##         31) vis< 55 17   380.1176 27.411760 *

第一個 split 使用 temp 變數 (溫度),切點為 67.5。輸出中每一列的內容依序為:(“node 編號”, “split 變數與切點”, “node 內樣本數”, “deviance (即 node 內的 RSS)”, “node 預測值 (node 內 O3 的平均)”)

*代表 terminal node

# Draw the skeleton of the fitted tree, then overlay the split rules and
# node values as text labels.
plot(x = tmod)
text(x = tmod)

# Redraw the tree with layout options of rpart's plot method:
#   compress = TRUE  attempt a more compact horizontal arrangement
#   uniform  = TRUE  use uniform vertical spacing between levels
#   branch   = 0.4   branch shape between V-shaped (0) and square (1)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
plot(tmod, compress = TRUE, uniform = TRUE, branch = 0.4)
text(tmod)

#install.packages("rpart.plot")
library(rpart.plot)
## Warning: 套件 'rpart.plot' 是用 R 版本 4.3.2 來建造的
# Draw the fitted regression tree with rpart.plot.
prp(tmod,               # fitted rpart model to display
    faclen=0,           # print factor level names in full (no abbreviation)
    fallen.leaves=TRUE, # place all leaf nodes at the bottom of the plot
    shadow.col="gray",  # draw a gray shadow under the node boxes
    # tmod is a regression tree, so extra=1 adds the number of observations
    # in each node (it is not a count of correct classifications)
    extra=1)

# Residuals against fitted values. A tree predicts one value per leaf, so
# the fitted values are jittered to spread out overlapping points.
plot(
  x = jitter(predict(tmod)),
  y = residuals(tmod),
  xlab = "Fitted",
  ylab = "Residuals"
)
# Horizontal reference line at zero residual.
abline(h = 0)

利用 Tree model 預測新樣本 (新樣本的每個 covariate 皆取該變數在資料中的中位數)

# Median of every predictor (all columns except the first, the response O3).
# vapply() works column by column and checks that each result is a single
# number, avoiding the data-frame-to-matrix coercion that apply() performs.
# The outer parentheses print the named vector after assigning it.
(x0 <- vapply(ozone[, -1], median, numeric(1)))
##       vh     wind humidity     temp      ibh      dpg      ibt      vis 
##   5760.0      5.0     64.0     62.0   2112.5     24.0    167.5    120.0 
##      doy 
##    205.5
# Turn the named vector x0 into a one-row data frame (one column per
# predictor) and get the tree's prediction for that new observation.
predict(object = tmod, newdata = data.frame(t(x0)))
##        1 
## 14.35484

Section 16.2 Tree Pruning

# Fix the random seed so the cross-validated errors in the CP table are
# reproducible from run to run.
set.seed(123)
# Grow a larger tree by lowering the complexity parameter to 0.001.
tmode <- rpart(formula = O3 ~ ., data = ozone, cp = 0.001)
# Print the complexity-parameter table (CP, nsplit, rel error, xerror, xstd).
printcp(tmode)
## 
## Regression tree:
## rpart(formula = O3 ~ ., data = ozone, cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] doy      dpg      humidity ibh      ibt      temp     vh       vis     
## 
## Root node error: 21115/330 = 63.986
## 
## n= 330 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.5456993      0   1.00000 1.00296 0.076477
## 2  0.0736591      1   0.45430 0.50019 0.043274
## 3  0.0535415      2   0.38064 0.45752 0.041780
## 4  0.0267557      3   0.32710 0.39790 0.038133
## 5  0.0232760      4   0.30034 0.41249 0.039340
## 6  0.0231021      5   0.27707 0.41252 0.039022
## 7  0.0153249      6   0.25397 0.39159 0.039024
## 8  0.0109137      7   0.23864 0.38880 0.038336
## 9  0.0070746      8   0.22773 0.36305 0.034622
## 10 0.0059918      9   0.22065 0.35872 0.033802
## 11 0.0059317     10   0.21466 0.35754 0.034008
## 12 0.0049709     12   0.20280 0.36274 0.035600
## 13 0.0047996     15   0.18789 0.36498 0.036045
## 14 0.0044712     16   0.18309 0.36910 0.036391
## 15 0.0031921     17   0.17861 0.36760 0.036586
## 16 0.0022152     19   0.17223 0.37110 0.037023
## 17 0.0020733     20   0.17002 0.37314 0.037073
## 18 0.0020297     22   0.16587 0.37261 0.036929
## 19 0.0014432     23   0.16384 0.37363 0.036966
## 20 0.0011322     24   0.16240 0.37209 0.037019
## 21 0.0011035     25   0.16126 0.37043 0.037059
## 22 0.0010000     26   0.16016 0.37004 0.037069
# Plot the cross-validation results from the CP table of the large tree.
plotcp(x = tmode)