Orange Juice data

We will use the Orange Juice data from the ISLR package. Since the data has 1070 observations, we will shuffle the rows first, taking the first 870 rows as the training set and rows 801 to 1070 (270 observations) as the test set.

library(ISLR)
attach(OJ)
set.seed(123)
oj_rand <- OJ[order(runif(1070)), ]  # shuffle the 1070 rows
oj_train <- oj_rand[1:870, ]
oj_test <- oj_rand[801:1070, ]
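
Note that rows 801 to 870 of the shuffled data land in both sets. If a disjoint split is preferred, sampling the row indices directly avoids the overlap; a minimal sketch, where train_idx, oj_train2 and oj_test2 are illustrative names not used elsewhere in this analysis:

set.seed(123)
train_idx <- sample(nrow(OJ), 870)  # draw 870 training rows without replacement
oj_train2 <- OJ[train_idx, ]        # training set
oj_test2 <- OJ[-train_idx, ]        # remaining 200 rows form the test set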

Tree

We will apply the tree() function from the tree package and rpart() from the rpart package to fit a tree to the training data, with Purchase as the response and all other variables as predictors. Then we will compare the results.

library(rpart)
library(tree)
ojtree <- tree(Purchase ~ ., data=oj_train)
summary(ojtree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH"     "SalePriceMM" "SpecialCH"   "PriceDiff"  
## Number of terminal nodes:  9 
## Residual mean deviance:  0.743 = 639.7 / 861 
## Misclassification error rate: 0.1586 = 138 / 870
ojtree_rpart <- rpart(Purchase ~ ., data=oj_train)
summary(ojtree_rpart)
## Call:
## rpart(formula = Purchase ~ ., data = oj_train)
##   n= 870 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.51754386      0 1.0000000 1.0000000 0.04212542
## 2 0.01851852      1 0.4824561 0.5263158 0.03493615
## 3 0.01169591      4 0.4269006 0.5087719 0.03449797
## 4 0.01000000      7 0.3859649 0.5233918 0.03486421
## 
## Variable importance
##        LoyalCH    SalePriceMM      PriceDiff         DiscMM      PctDiscMM 
##             53              9              7              6              6 
##        PriceMM  ListPriceDiff        StoreID        PriceCH WeekofPurchase 
##              6              4              4              2              2 
##    SalePriceCH          STORE      SpecialCH 
##              2              1              1 
## 
## Node number 1: 870 observations,    complexity param=0.5175439
##   predicted class=CH  expected loss=0.3931034  P(node) =1
##     class counts:   528   342
##    probabilities: 0.607 0.393 
##   left son=2 (543 obs) right son=3 (327 obs)
##   Primary splits:
##       LoyalCH   < 0.482935  to the right, improve=149.35520, (0 missing)
##       StoreID   < 3.5       to the right, improve= 46.03781, (0 missing)
##       Store7    splits as  RL, improve= 22.22719, (0 missing)
##       STORE     < 0.5       to the left,  improve= 22.22719, (0 missing)
##       PriceDiff < 0.265     to the right, improve= 21.08085, (0 missing)
##   Surrogate splits:
##       StoreID     < 3.5       to the right, agree=0.643, adj=0.049, (0 split)
##       PriceMM     < 1.89      to the right, agree=0.634, adj=0.028, (0 split)
##       DiscMM      < 0.57      to the left,  agree=0.633, adj=0.024, (0 split)
##       SalePriceMM < 1.585     to the right, agree=0.633, adj=0.024, (0 split)
##       PctDiscMM   < 0.264375  to the left,  agree=0.633, adj=0.024, (0 split)
## 
## Node number 2: 543 observations,    complexity param=0.01851852
##   predicted class=CH  expected loss=0.1657459  P(node) =0.6241379
##     class counts:   453    90
##    probabilities: 0.834 0.166 
##   left son=4 (286 obs) right son=5 (257 obs)
##   Primary splits:
##       LoyalCH       < 0.7645725 to the right, improve=18.519040, (0 missing)
##       PriceDiff     < -0.39     to the right, improve=13.011600, (0 missing)
##       SalePriceMM   < 1.84      to the right, improve=10.804600, (0 missing)
##       ListPriceDiff < 0.235     to the right, improve= 8.763399, (0 missing)
##       DiscMM        < 0.03      to the left,  improve= 6.725406, (0 missing)
##   Surrogate splits:
##       WeekofPurchase < 256.5     to the right, agree=0.613, adj=0.183, (0 split)
##       PriceCH        < 1.825     to the right, agree=0.611, adj=0.179, (0 split)
##       PriceMM        < 2.04      to the right, agree=0.610, adj=0.175, (0 split)
##       StoreID        < 3.5       to the right, agree=0.608, adj=0.171, (0 split)
##       SalePriceMM    < 2.04      to the right, agree=0.586, adj=0.125, (0 split)
## 
## Node number 3: 327 observations,    complexity param=0.01169591
##   predicted class=MM  expected loss=0.2293578  P(node) =0.3758621
##     class counts:    75   252
##    probabilities: 0.229 0.771 
##   left son=6 (144 obs) right son=7 (183 obs)
##   Primary splits:
##       LoyalCH   < 0.2761415 to the right, improve=10.916000, (0 missing)
##       StoreID   < 3.5       to the right, improve= 7.090583, (0 missing)
##       SpecialCH < 0.5       to the right, improve= 6.104074, (0 missing)
##       PriceDiff < 0.31      to the right, improve= 5.817721, (0 missing)
##       Store7    splits as  RL, improve= 4.816648, (0 missing)
##   Surrogate splits:
##       STORE       < 1.5       to the left,  agree=0.648, adj=0.201, (0 split)
##       StoreID     < 1.5       to the left,  agree=0.621, adj=0.139, (0 split)
##       SpecialCH   < 0.5       to the right, agree=0.596, adj=0.083, (0 split)
##       SalePriceCH < 1.72      to the left,  agree=0.596, adj=0.083, (0 split)
##       DiscCH      < 0.145     to the right, agree=0.593, adj=0.076, (0 split)
## 
## Node number 4: 286 observations
##   predicted class=CH  expected loss=0.04195804  P(node) =0.3287356
##     class counts:   274    12
##    probabilities: 0.958 0.042 
## 
## Node number 5: 257 observations,    complexity param=0.01851852
##   predicted class=CH  expected loss=0.3035019  P(node) =0.2954023
##     class counts:   179    78
##    probabilities: 0.696 0.304 
##   left son=10 (157 obs) right son=11 (100 obs)
##   Primary splits:
##       PriceDiff     < 0.145     to the right, improve=15.345160, (0 missing)
##       ListPriceDiff < 0.235     to the right, improve=14.842860, (0 missing)
##       SalePriceMM   < 1.84      to the right, improve=12.717920, (0 missing)
##       DiscMM        < 0.03      to the left,  improve= 9.229346, (0 missing)
##       PctDiscMM     < 0.0137615 to the left,  improve= 9.229346, (0 missing)
##   Surrogate splits:
##       SalePriceMM   < 1.94      to the right, agree=0.957, adj=0.89, (0 split)
##       DiscMM        < 0.08      to the left,  agree=0.891, adj=0.72, (0 split)
##       PctDiscMM     < 0.038887  to the left,  agree=0.891, adj=0.72, (0 split)
##       ListPriceDiff < 0.18      to the right, agree=0.790, adj=0.46, (0 split)
##       PriceMM       < 2.04      to the right, agree=0.747, adj=0.35, (0 split)
## 
## Node number 6: 144 observations,    complexity param=0.01169591
##   predicted class=MM  expected loss=0.375  P(node) =0.1655172
##     class counts:    54    90
##    probabilities: 0.375 0.625 
##   left son=12 (62 obs) right son=13 (82 obs)
##   Primary splits:
##       SalePriceMM < 2.04      to the right, improve=7.821007, (0 missing)
##       PriceDiff   < 0.25      to the right, improve=7.181214, (0 missing)
##       DiscMM      < 0.22      to the left,  improve=5.062500, (0 missing)
##       PctDiscMM   < 0.0729725 to the left,  improve=4.703400, (0 missing)
##       SpecialCH   < 0.5       to the right, improve=4.182927, (0 missing)
##   Surrogate splits:
##       PriceDiff      < 0.135     to the right, agree=0.910, adj=0.790, (0 split)
##       DiscMM         < 0.08      to the left,  agree=0.819, adj=0.581, (0 split)
##       PctDiscMM      < 0.038887  to the left,  agree=0.819, adj=0.581, (0 split)
##       PriceMM        < 2.04      to the right, agree=0.792, adj=0.516, (0 split)
##       WeekofPurchase < 243.5     to the right, agree=0.708, adj=0.323, (0 split)
## 
## Node number 7: 183 observations
##   predicted class=MM  expected loss=0.1147541  P(node) =0.2103448
##     class counts:    21   162
##    probabilities: 0.115 0.885 
## 
## Node number 10: 157 observations
##   predicted class=CH  expected loss=0.1656051  P(node) =0.1804598
##     class counts:   131    26
##    probabilities: 0.834 0.166 
## 
## Node number 11: 100 observations,    complexity param=0.01851852
##   predicted class=MM  expected loss=0.48  P(node) =0.1149425
##     class counts:    48    52
##    probabilities: 0.480 0.520 
##   left son=22 (31 obs) right son=23 (69 obs)
##   Primary splits:
##       ListPriceDiff < 0.235     to the right, improve=6.164974, (0 missing)
##       DiscMM        < 0.47      to the left,  improve=4.800952, (0 missing)
##       PriceDiff     < -0.35     to the right, improve=4.800952, (0 missing)
##       PctDiscMM     < 0.227263  to the left,  improve=4.800952, (0 missing)
##       PriceCH       < 1.755     to the left,  improve=4.455519, (0 missing)
##   Surrogate splits:
##       PriceCH     < 1.755     to the left,  agree=0.78, adj=0.290, (0 split)
##       SalePriceCH < 1.755     to the left,  agree=0.78, adj=0.290, (0 split)
##       StoreID     < 5.5       to the right, agree=0.73, adj=0.129, (0 split)
##       SpecialCH   < 0.5       to the right, agree=0.73, adj=0.129, (0 split)
##       Store7      splits as  RL, agree=0.73, adj=0.129, (0 split)
## 
## Node number 12: 62 observations,    complexity param=0.01169591
##   predicted class=CH  expected loss=0.4354839  P(node) =0.07126437
##     class counts:    35    27
##    probabilities: 0.565 0.435 
##   left son=24 (52 obs) right son=25 (10 obs)
##   Primary splits:
##       SalePriceCH < 1.94      to the left,  improve=3.168486, (0 missing)
##       LoyalCH     < 0.303104  to the left,  improve=2.992962, (0 missing)
##       PriceMM     < 2.205     to the left,  improve=2.467099, (0 missing)
##       SalePriceMM < 2.205     to the left,  improve=2.467099, (0 missing)
##       PriceCH     < 1.94      to the left,  improve=2.096943, (0 missing)
##   Surrogate splits:
##       PriceMM     < 2.205     to the left,  agree=0.952, adj=0.7, (0 split)
##       SalePriceMM < 2.205     to the left,  agree=0.952, adj=0.7, (0 split)
##       PriceCH     < 1.94      to the left,  agree=0.887, adj=0.3, (0 split)
##       PriceDiff   < 0.195     to the right, agree=0.871, adj=0.2, (0 split)
##       STORE       < 2.5       to the left,  agree=0.871, adj=0.2, (0 split)
## 
## Node number 13: 82 observations
##   predicted class=MM  expected loss=0.2317073  P(node) =0.09425287
##     class counts:    19    63
##    probabilities: 0.232 0.768 
## 
## Node number 22: 31 observations
##   predicted class=CH  expected loss=0.2580645  P(node) =0.03563218
##     class counts:    23     8
##    probabilities: 0.742 0.258 
## 
## Node number 23: 69 observations
##   predicted class=MM  expected loss=0.3623188  P(node) =0.07931034
##     class counts:    25    44
##    probabilities: 0.362 0.638 
## 
## Node number 24: 52 observations
##   predicted class=CH  expected loss=0.3653846  P(node) =0.05977011
##     class counts:    33    19
##    probabilities: 0.635 0.365 
## 
## Node number 25: 10 observations
##   predicted class=MM  expected loss=0.2  P(node) =0.01149425
##     class counts:     2     8
##    probabilities: 0.200 0.800

As we can see, the rpart() function provides a lot more information. On this particular data set, rpart() and tree() did not produce very different trees, but generally speaking the two functions can yield quite different results.
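
One quick way to compare the two fits visually is to draw them side by side with the base plot methods each package provides; a minimal sketch, assuming a graphics device wide enough for two panels:

par(mfrow = c(1, 2))      # two panels side by side
plot(ojtree)              # tree() fit
text(ojtree, pretty = 0)  # label splits with variable names
plot(ojtree_rpart)        # rpart() fit
text(ojtree_rpart)
par(mfrow = c(1, 1))      # restore the default layout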

Find the error rate from rpart

printcp(ojtree_rpart)
## 
## Classification tree:
## rpart(formula = Purchase ~ ., data = oj_train)
## 
## Variables actually used in tree construction:
## [1] ListPriceDiff LoyalCH       PriceDiff     SalePriceCH   SalePriceMM  
## 
## Root node error: 342/870 = 0.3931
## 
## n= 870 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.517544      0   1.00000 1.00000 0.042125
## 2 0.018519      1   0.48246 0.52632 0.034936
## 3 0.011696      4   0.42690 0.50877 0.034498
## 4 0.010000      7   0.38596 0.52339 0.034864

Our training error rate is the root node error times the relative error of the final tree: 0.3931 * 0.38596 = 0.152, which is very close to the tree() output of 0.1586.
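
We can sanity-check that figure by computing the resubstitution error of the rpart fit directly; rpart_pred is an illustrative name:

rpart_pred <- predict(ojtree_rpart, oj_train, type="class")  # fitted classes on the training set
mean(rpart_pred != oj_train$Purchase)                        # training misclassification rate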

Prediction and cross-validation

oj_pred <- predict(ojtree, oj_test, type="class")
table(oj_test$Purchase, oj_pred)
##     oj_pred
##       CH  MM
##   CH 153  17
##   MM  24  76

The tree misclassified 17 + 24 = 41 of the 270 test observations, a test error rate of 41/270 = 0.1519, close to the 16% training error.
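
The same rate can be read off the confusion matrix programmatically; conf_mat is an illustrative name:

conf_mat <- table(oj_test$Purchase, oj_pred)  # rows are truth, columns are predictions
1 - sum(diag(conf_mat))/sum(conf_mat)         # off-diagonal share = test error rate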

cvoj <- cv.tree(ojtree, FUN=prune.tree)
cvoj
## $size
## [1] 9 8 7 6 5 4 3 2 1
## 
## $dev
## [1]  794.7491  788.2223  788.0953  802.8324  799.0773  809.0060  808.8779
## [8]  865.1938 1167.9557
## 
## $k
## [1]      -Inf  12.11869  13.03122  14.90836  16.83680  31.22749  39.44198
## [8]  72.60695 326.12678
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

So we see the optimal size is 7 terminal nodes, where the cross-validated deviance reaches its minimum.
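
Rather than reading the minimum off the printed list, we can extract it directly; best_size is an illustrative name:

best_size <- cvoj$size[which.min(cvoj$dev)]  # tree size with the lowest CV deviance
best_size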

plot(cvoj$size, cvoj$dev, type="b", xlab="Tree Size", ylab="Deviance")

As we can see from the plot, the deviance is indeed minimized at a size of 7.

Create the pruned tree with 7 terminal nodes for the optimal result.

ojpruned <- prune.tree(ojtree, best=7)
summary(ojpruned)
## 
## Classification tree:
## snip.tree(tree = ojtree, nodes = c(10L, 4L))
## Variables actually used in tree construction:
## [1] "LoyalCH"     "SalePriceMM" "PriceDiff"  
## Number of terminal nodes:  7 
## Residual mean deviance:  0.7704 = 664.9 / 863 
## Misclassification error rate: 0.1621 = 141 / 870

Now the training misclassification error is 0.162, a little higher than for the unpruned tree.
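
For reference, the pruned tree can be drawn with the same base plot method used for tree objects:

plot(ojpruned)              # structure of the 7-node pruned tree
text(ojpruned, pretty = 0)  # label splits with variable names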

Compare the pruned and unpruned trees

Training error

pred_unprune <- predict(ojtree, oj_train, type="class")
mis_unprune <- sum(oj_train$Purchase != pred_unprune)
mis_unprune/length(pred_unprune)
## [1] 0.1586207
pred_prune <- predict(ojpruned, oj_train, type="class")
mis_prune <- sum(oj_train$Purchase != pred_prune)
mis_prune/length(pred_prune)
## [1] 0.162069

The pruned tree has a slightly higher training error rate.

Test error

pred_unprune <- predict(ojtree, oj_test, type="class")
mis_unprune <- sum(oj_test$Purchase != pred_unprune)
mis_unprune/length(pred_unprune)
## [1] 0.1518519
pred_prune <- predict(ojpruned, oj_test, type="class")
mis_prune <- sum(oj_test$Purchase != pred_prune)
mis_prune/length(pred_prune)
## [1] 0.1555556

So the pruned tree has a slightly higher error rate on the test data as well. Note that once the tree has 3 or more terminal nodes, the cross-validated deviance does not vary much.
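
To wrap up, the four error rates can be collected into one small data frame for side-by-side comparison; err_summary is an illustrative name, and the predict() calls simply repeat the computations above:

err_summary <- data.frame(
  tree = c("unpruned", "pruned"),
  train = c(mean(predict(ojtree, oj_train, type="class") != oj_train$Purchase),
            mean(predict(ojpruned, oj_train, type="class") != oj_train$Purchase)),
  test = c(mean(predict(ojtree, oj_test, type="class") != oj_test$Purchase),
           mean(predict(ojpruned, oj_test, type="class") != oj_test$Purchase)))
err_summary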