Lab 8: Decision Trees
Fitting Classification Trees
library(tree)
library(ISLR)
attach(Carseats)
High = ifelse(Sales <= 8, "No", "Yes")
Carseats = data.frame(Carseats, High)
tree.carseats = tree(High ~ . -Sales, Carseats)
summary(tree.carseats)
Classification tree:
tree(formula = High ~ . - Sales, data = Carseats)
Variables actually used in tree construction:
[1] "ShelveLoc" "Price" "Income"
[4] "CompPrice" "Population" "Advertising"
[7] "Age" "US"
Number of terminal nodes: 27
Residual mean deviance: 0.4575 = 170.7 / 373
Misclassification error rate: 0.09 = 36 / 400
plot(tree.carseats)
text(tree.carseats, pretty = 0)

tree.carseats
node), split, n, deviance, yval, (yprob)
* denotes terminal node
1) root 400 541.500 No ( 0.59000 0.41000 )
2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )
4) Price < 92.5 46 56.530 Yes ( 0.30435 0.69565 )
8) Income < 57 10 12.220 No ( 0.70000 0.30000 )
16) CompPrice < 110.5 5 0.000 No ( 1.00000 0.00000 ) *
17) CompPrice > 110.5 5 6.730 Yes ( 0.40000 0.60000 ) *
9) Income > 57 36 35.470 Yes ( 0.19444 0.80556 )
18) Population < 207.5 16 21.170 Yes ( 0.37500 0.62500 ) *
19) Population > 207.5 20 7.941 Yes ( 0.05000 0.95000 ) *
5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )
10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )
20) CompPrice < 124.5 96 44.890 No ( 0.93750 0.06250 )
40) Price < 106.5 38 33.150 No ( 0.84211 0.15789 )
80) Population < 177 12 16.300 No ( 0.58333 0.41667 )
160) Income < 60.5 6 0.000 No ( 1.00000 0.00000 ) *
161) Income > 60.5 6 5.407 Yes ( 0.16667 0.83333 ) *
81) Population > 177 26 8.477 No ( 0.96154 0.03846 ) *
41) Price > 106.5 58 0.000 No ( 1.00000 0.00000 ) *
21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )
42) Price < 122.5 51 70.680 Yes ( 0.49020 0.50980 )
84) ShelveLoc: Bad 11 6.702 No ( 0.90909 0.09091 ) *
85) ShelveLoc: Medium 40 52.930 Yes ( 0.37500 0.62500 )
170) Price < 109.5 16 7.481 Yes ( 0.06250 0.93750 ) *
171) Price > 109.5 24 32.600 No ( 0.58333 0.41667 )
342) Age < 49.5 13 16.050 Yes ( 0.30769 0.69231 ) *
343) Age > 49.5 11 6.702 No ( 0.90909 0.09091 ) *
43) Price > 122.5 77 55.540 No ( 0.88312 0.11688 )
86) CompPrice < 147.5 58 17.400 No ( 0.96552 0.03448 ) *
87) CompPrice > 147.5 19 25.010 No ( 0.63158 0.36842 )
174) Price < 147 12 16.300 Yes ( 0.41667 0.58333 )
348) CompPrice < 152.5 7 5.742 Yes ( 0.14286 0.85714 ) *
349) CompPrice > 152.5 5 5.004 No ( 0.80000 0.20000 ) *
175) Price > 147 7 0.000 No ( 1.00000 0.00000 ) *
11) Advertising > 13.5 45 61.830 Yes ( 0.44444 0.55556 )
22) Age < 54.5 25 25.020 Yes ( 0.20000 0.80000 )
44) CompPrice < 130.5 14 18.250 Yes ( 0.35714 0.64286 )
88) Income < 100 9 12.370 No ( 0.55556 0.44444 ) *
89) Income > 100 5 0.000 Yes ( 0.00000 1.00000 ) *
45) CompPrice > 130.5 11 0.000 Yes ( 0.00000 1.00000 ) *
23) Age > 54.5 20 22.490 No ( 0.75000 0.25000 )
46) CompPrice < 122.5 10 0.000 No ( 1.00000 0.00000 ) *
47) CompPrice > 122.5 10 13.860 No ( 0.50000 0.50000 )
94) Price < 125 5 0.000 Yes ( 0.00000 1.00000 ) *
95) Price > 125 5 0.000 No ( 1.00000 0.00000 ) *
3) ShelveLoc: Good 85 90.330 Yes ( 0.22353 0.77647 )
6) Price < 135 68 49.260 Yes ( 0.11765 0.88235 )
12) US: No 17 22.070 Yes ( 0.35294 0.64706 )
24) Price < 109 8 0.000 Yes ( 0.00000 1.00000 ) *
25) Price > 109 9 11.460 No ( 0.66667 0.33333 ) *
13) US: Yes 51 16.880 Yes ( 0.03922 0.96078 ) *
7) Price > 135 17 22.070 No ( 0.64706 0.35294 )
14) Income < 46 6 0.000 No ( 1.00000 0.00000 ) *
15) Income > 46 11 15.160 Yes ( 0.45455 0.54545 ) *
set.seed(2)
train = sample(1:nrow(Carseats), 200)
Carseats.test = Carseats[-train,]
High.test = High[-train]
tree.carseats = tree(High ~ . - Sales, Carseats, subset = train)
tree.pred = predict(tree.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
High.test
tree.pred No Yes
No 86 27
Yes 30 57
(86+57)/200
[1] 0.715
set.seed(3)
cv.carseats = cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
[1] "size" "dev" "k" "method"
cv.carseats
$size
[1] 19 17 14 13 9 7 3 2 1
$dev
[1] 55 55 53 52 50 56 69 65 80
$k
[1] -Inf 0.0000000 0.6666667 1.0000000
[5] 1.7500000 2.0000000 4.2500000 5.0000000
[9] 23.0000000
$method
[1] "misclass"
attr(,"class")
[1] "prune" "tree.sequence"
par(mfrow = c(1,2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")

prune.carseats = prune.misclass(tree.carseats, best = 9)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

tree.pred = predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
High.test
tree.pred No Yes
No 94 24
Yes 22 60
(96+60)/200
[1] 0.78
prune.carseats = prune.misclass(tree.carseats, best = 15)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

tree.pred = predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
High.test
tree.pred No Yes
No 86 22
Yes 30 62
(86+62)/200
[1] 0.74
Fitting Regression Trees
library(MASS)
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston = tree(medv ~., Boston, subset = train)
summary(tree.boston)
Regression tree:
tree(formula = medv ~ ., data = Boston, subset = train)
Variables actually used in tree construction:
[1] "lstat" "rm" "dis"
Number of terminal nodes: 8
Residual mean deviance: 12.65 = 3099 / 245
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu.
-14.10000 -2.04200 -0.05357 0.00000 1.96000
Max.
12.60000
plot(tree.boston)
text(tree.boston, pretty = 0)

cv.boston = cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type = "b")

prune.boston = prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)

yhat = predict(tree.boston, newdata = Boston[-train,])
boston.test = Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0, 1)

mean((yhat-boston.test)^2)
[1] 25.04559
Bagging and Random Forest
library(randomForest)
set.seed(1)
bag.boston = randomForest(medv ~ ., data = Boston, subset = train, mtry = 13, importance = TRUE)
bag.boston
Call:
randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 13
Mean of squared residuals: 11.02509
% Var explained: 86.65
yhat.bag = predict(bag.boston, newdata = Boston[-train,])
plot(yhat.bag, boston.test)
abline(0,1)

mean((yhat.bag-boston.test)^2)
[1] 13.47349
bag.boston = randomForest(medv ~ ., data = Boston, subset = train, mtry = 13, ntree = 25)
yhat.bag = predict(bag.boston, newdata = Boston[-train,])
mean((yhat.bag-boston.test)^2)
[1] 13.43068
set.seed(1)
rf.boston = randomForest(medv ~ ., data = Boston, subset = train, mtry = 6, importance = TRUE)
yhat.rf = predict(rf.boston, newdata = Boston[-train,])
mean((yhat.rf-boston.test)^2)
[1] 11.48022
importance(rf.boston)
varImpPlot(rf.boston)
Boosting
library(gbm)
Loaded gbm 2.1.3
set.seed(1)
boost.boston = gbm(medv ~ ., data = Boston[train,], distribution = "gaussian", n.trees = 5000, interaction.depth = 4)
summary(boost.boston)
par(mfrow = c(1,2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")

yhat.boost = predict(boost.boston, newdata = Boston[-train,], n.trees = 5000)
mean((yhat.boost-boston.test)^2)
[1] 11.84434
boost.boston = gbm(medv ~ ., data = Boston[train,], distribution = "gaussian", n.trees = 5000, interaction.depth = 4, shrinkage = 0.2, verbose = F)
yhat.boost = predict(boost.boston, newdata = Boston[-train,], n.trees = 5000)
mean((yhat.boost-boston.test)^2)
[1] 11.51109
LS0tDQp0aXRsZTogIkxhYm9yYXRvcmlvIDgiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCmF1dGhvcjogIkx1aXMgSmltZW5leiINCi0tLQ0KDQojIyMgTGFiIDg6IERlY2lzaW9uIFRyZWVzDQoNCiMjIyMgRml0dGluZyBDbGFzc2lmaWNhdGlvbiBUcmVlcw0KDQpgYGB7cn0NCmxpYnJhcnkodHJlZSkNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkoSVNMUikNCmF0dGFjaChDYXJzZWF0cykNCkhpZ2ggPSBpZmVsc2UoU2FsZXMgPD0gOCwgIk5vIiwgIlllcyIpDQpgYGANCg0KYGBge3J9DQpDYXJzZWF0cyA9IGRhdGEuZnJhbWUoQ2Fyc2VhdHMsIEhpZ2gpDQpgYGANCg0KYGBge3J9DQp0cmVlLmNhcnNlYXRzID0gdHJlZShIaWdoIH4gLiAtIFNhbGVzLCBDYXJzZWF0cykNCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkodHJlZS5jYXJzZWF0cykNCmBgYA0KDQpgYGB7cn0NCnBsb3QodHJlZS5jYXJzZWF0cykNCnRleHQodHJlZS5jYXJzZWF0cywgcHJldHR5ID0gMCkNCmBgYA0KDQpgYGB7cn0NCnRyZWUuY2Fyc2VhdHMNCmBgYA0KDQpgYGB7cn0NCnNldC5zZWVkKDIpDQp0cmFpbiA9IHNhbXBsZSgxOm5yb3coQ2Fyc2VhdHMpLCAyMDApDQpDYXJzZWF0cy50ZXN0ID0gQ2Fyc2VhdHNbLXRyYWluLF0NCkhpZ2gudGVzdCA9IEhpZ2hbLXRyYWluXQ0KdHJlZS5jYXJzZWF0cyA9IHRyZWUoSGlnaCB+IC4gLSBTYWxlcywgQ2Fyc2VhdHMsIHN1YnNldCA9IHRyYWluKQ0KdHJlZS5wcmVkID0gcHJlZGljdCh0cmVlLmNhcnNlYXRzLCBDYXJzZWF0cy50ZXN0LCB0eXBlID0gImNsYXNzIikNCnRhYmxlKHRyZWUucHJlZCwgSGlnaC50ZXN0KQ0KKDg2KzU3KS8yMDANCmBgYA0KDQpgYGB7cn0NCnNldC5zZWVkKDMpDQpjdi5jYXJzZWF0cyA9IGN2LnRyZWUodHJlZS5jYXJzZWF0cywgRlVOID0gcHJ1bmUubWlzY2xhc3MpDQpuYW1lcyhjdi5jYXJzZWF0cykNCmN2LmNhcnNlYXRzDQpgYGANCg0KYGBge3J9DQpwYXIobWZyb3cgPSBjKDEsMikpDQpwbG90KGN2LmNhcnNlYXRzJHNpemUsIGN2LmNhcnNlYXRzJGRldiwgdHlwZSA9ICJiIikNCnBsb3QoY3YuY2Fyc2VhdHMkaywgY3YuY2Fyc2VhdHMkZGV2LCB0eXBlID0gImIiKQ0KYGBgDQoNCmBgYHtyfQ0KcHJ1bmUuY2Fyc2VhdHMgPSBwcnVuZS5taXNjbGFzcyh0cmVlLmNhcnNlYXRzLCBiZXN0ID0gOSkNCnBsb3QocHJ1bmUuY2Fyc2VhdHMpDQp0ZXh0KHBydW5lLmNhcnNlYXRzLCBwcmV0dHkgPSAwKQ0KYGBgDQoNCmBgYHtyfQ0KdHJlZS5wcmVkID0gcHJlZGljdChwcnVuZS5jYXJzZWF0cywgQ2Fyc2VhdHMudGVzdCwgdHlwZSA9ICJjbGFzcyIpDQp0YWJsZSh0cmVlLnByZWQsIEhpZ2gudGVzdCkNCig5Nis2MCkvMjAwDQpgYGANCg0KYGBge3J9DQpwcnVuZS5jYXJzZWF0cyA9IHBydW5lLm1pc2NsYXNzKHRyZWUuY2Fyc2VhdHMsIGJlc3QgPSAxNSkNCnBsb3QocHJ1bmUuY2Fyc2VhdHMpDQp0ZXh0KHBydW5lLmNhcnNlYXRzLCBwcmV0dHkgPSAwKQ0KdHJlZS5wcmVkID0gcHJlZGljdChwcnVuZS5jYXJzZWF0cywgQ2Fyc2VhdHMudGVzdCwgdHlwZSA9ICJjbGFzcyIpDQp0YWJsZSh0cmVlLnByZWQsIEhpZ2gudGVzdCkNCig4Nis2MikvMjAwDQpgYGANCg0KIyMjIyBGaXR0aW5nIFJlZ3Jlc3Npb24gVHJlZXMNCg0KYGBge3J9DQpsaWJyYXJ5KE1BU1MpDQpzZXQuc2VlZCgxKQ0KdHJhaW4gPSBzYW1wbGUoMTpucm93KEJvc3RvbiksIG5yb3coQm9zdG9uKS8yKQ0KdHJlZS5ib3N0b24gPSB0cmVlKG1lZHYgfi4sIEJvc3Rvbiwgc3Vic2V0ID0gdHJhaW4pDQpzdW1tYXJ5KHRyZWUuYm9zdG9uKQ0KYGBgDQoNCmBgYHtyfQ0KcGxvdCh0cmVlLmJvc3RvbikNCnRleHQodHJlZS5ib3N0b24sIHByZXR0eSA9IDApDQpgYGANCg0KYGBge3J9DQpjdi5ib3N0b24gPSBjdi50cmVlKHRyZWUuYm9zdG9uKQ0KcGxvdChjdi5ib3N0b24kc2l6ZSwgY3YuYm9zdG9uJGRldiwgdHlwZSA9ICJiIikNCmBgYA0KDQpgYGB7cn0NCnBydW5lLmJvc3RvbiA9IHBydW5lLnRyZWUodHJlZS5ib3N0b24sIGJlc3QgPSA1KQ0KcGxvdChwcnVuZS5ib3N0b24pDQp0ZXh0KHBydW5lLmJvc3RvbiwgcHJldHR5ID0gMCkNCmBgYA0KDQpgYGB7cn0NCnloYXQgPSBwcmVkaWN0KHRyZWUuYm9zdG9uLCBuZXdkYXRhID0gQm9zdG9uWy10cmFpbixdKQ0KYm9zdG9uLnRlc3QgPSBCb3N0b25bLXRyYWluLCAibWVkdiJdDQpwbG90KHloYXQsIGJvc3Rvbi50ZXN0KQ0KYWJsaW5lKDAsIDEpDQptZWFuKCh5aGF0LWJvc3Rvbi50ZXN0KV4yKQ0KYGBgDQoNCiMjIyMgQmFnZ2luZyBhbmQgUmFuZG9tIEZvcmVzdA0KDQpgYGB7cn0NCmxpYnJhcnkocmFuZG9tRm9yZXN0KQ0Kc2V0LnNlZWQoMSkNCmJhZy5ib3N0b24gPSByYW5kb21Gb3Jlc3QobWVkdiB+IC4sIGRhdGEgPSBCb3N0b24sIHN1YnNldCA9IHRyYWluLCBtdHJ5ID0gMTMsIGltcG9ydGFuY2UgPSBUUlVFKQ0KYmFnLmJvc3Rvbg0KYGBgDQoNCmBgYHtyfQ0KeWhhdC5iYWcgPSBwcmVkaWN0KGJhZy5ib3N0b24sIG5ld2RhdGEgPSBCb3N0b25bLXRyYWluLF0pDQpwbG90KHloYXQuYmFnLCBib3N0b24udGVzdCkNCmFibGluZSgwLDEpDQptZWFuKCh5aGF0LmJhZy1ib3N0b24udGVzdCleMikNCmBgYA0KDQpgYGB7cn0NCmJhZy5ib3N0b24gPSByYW5kb21Gb3Jlc3QobWVkdiB+IC4sIGRhdGEgPSBCb3N0b24sIHN1YnNldCA9IHRyYWluLCBtdHJ5ID0gMTMsIG50cmVlID0gMjUpDQp5aGF0LmJhZyA9IHByZWRpY3QoYmFnLmJvc3RvbiwgbmV3ZGF0YSA9IEJvc3RvblstdHJhaW4sXSkNCm1lYW4oKHloYXQuYmFnLWJvc3Rvbi50ZXN0KV4yKQ0KYGBgDQoNCmBgYHtyfQ0Kc2V0LnNlZWQoMSkNCnJmLmJvc3RvbiA9IHJhbmRvbUZvcmVzdChtZWR2IH4gLiwgZGF0YSA9IEJvc3Rvbiwgc3Vic2V0ID0gdHJhaW4sIG10cnkgPSA2LCBpbXBvcnRhbmNlID0gVFJVRSkNCnloYXQucmYgPSBwcmVkaWN0KHJmLmJvc3RvbiwgbmV3ZGF0YSA9IEJvc3RvblstdHJhaW4sXSkNCm1lYW4oKHloYXQucmYtYm9zdG9uLnRlc3QpXjIpDQpgYGANCg0KYGBge3J9DQppbXBvcnRhbmNlKHJmLmJvc3RvbikNCmBgYA0KDQpgYGB7cn0NCnZhckltcFBsb3QocmYuYm9zdG9uKQ0KYGBgDQoNCiMjIyMgQm9vc3RpbmcNCg0KYGBge3J9DQpsaWJyYXJ5KGdibSkNCnNldC5zZWVkKDEpDQpib29zdC5ib3N0b24gPSBnYm0obWVkdiB+IC4sIGRhdGEgPSBCb3N0b25bdHJhaW4sXSwgZGlzdHJpYnV0aW9uID0gImdhdXNzaWFuIiwgbi50cmVlcyA9IDUwMDAsIGludGVyYWN0aW9uLmRlcHRoID0gNCkNCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkoYm9vc3QuYm9zdG9uKQ0KYGBgDQoNCmBgYHtyfQ0KcGFyKG1mcm93ID0gYygxLDIpKQ0KcGxvdChib29zdC5ib3N0b24sIGkgPSAicm0iKQ0KcGxvdChib29zdC5ib3N0b24sIGkgPSAibHN0YXQiKQ0KYGBgDQoNCmBgYHtyfQ0KeWhhdC5ib29zdCA9IHByZWRpY3QoYm9vc3QuYm9zdG9uLCBuZXdkYXRhID0gQm9zdG9uWy10cmFpbixdLCBuLnRyZWVzID0gNTAwMCkNCm1lYW4oKHloYXQuYm9vc3QtYm9zdG9uLnRlc3QpXjIpDQpgYGANCg0KYGBge3J9DQpib29zdC5ib3N0b24gPSBnYm0obWVkdiB+IC4sIGRhdGEgPSBCb3N0b25bdHJhaW4sXSwgZGlzdHJpYnV0aW9uID0gImdhdXNzaWFuIiwgbi50cmVlcyA9IDUwMDAsIGludGVyYWN0aW9uLmRlcHRoID0gNCwgc2hyaW5rYWdlID0gMC4yLCB2ZXJib3NlID0gRikNCnloYXQuYm9vc3QgPSBwcmVkaWN0KGJvb3N0LmJvc3RvbiwgbmV3ZGF0YSA9IEJvc3RvblstdHJhaW4sXSwgbi50cmVlcyA9IDUwMDApDQptZWFuKCh5aGF0LmJvb3N0LWJvc3Rvbi50ZXN0KV4yKQ0KYGBg