This code updates the decision-tree examples from ISLR, which use the tree
package. Here we use the rpart package instead.
We will need several packages for today’s code:
#install.packages("ISLR")
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.2
#install.packages("rpart")
library(rpart)
#install.packages("MASS")
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.2
First we want to partition our data into a training and testing set.
# Split Boston in half: draw the training row indices at random
# (seeded so the split can be reproduced)
set.seed(1)
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)
# Grow a regression tree (method = "anova") for medv on all other columns,
# using the training rows only
tree.boston <- rpart(
  formula = medv ~ .,
  data = Boston,
  method = "anova",
  subset = train
)
# Detailed report: CP table, variable importance and the splits at each node
summary(tree.boston)
## Call:
## rpart(formula = medv ~ ., data = Boston, subset = train, method = "anova")
## n= 253
##
## CP nsplit rel error xerror xstd
## 1 0.55145333 0 1.0000000 1.0024582 0.12627136
## 2 0.17610053 1 0.4485467 0.4845589 0.04276338
## 3 0.05689532 2 0.2724461 0.3031982 0.03060864
## 4 0.04093613 3 0.2155508 0.2760398 0.02989290
## 5 0.03276814 4 0.1746147 0.2387489 0.02848397
## 6 0.01048773 5 0.1418465 0.2169243 0.02729011
## 7 0.01000000 6 0.1313588 0.2177695 0.02733643
##
## Variable importance
## rm lstat crim dis age nox zn indus tax ptratio
## 34 23 9 7 6 5 4 4 4 4
##
## Node number 1: 253 observations, complexity param=0.5514533
## mean=21.78656, MSE=76.86907
## left son=2 (222 obs) right son=3 (31 obs)
## Primary splits:
## rm < 6.9595 to the left, improve=0.5514533, (0 missing)
## lstat < 8.13 to the right, improve=0.4710854, (0 missing)
## ptratio < 19.65 to the right, improve=0.2687694, (0 missing)
## indus < 6.66 to the right, improve=0.2552622, (0 missing)
## nox < 0.5125 to the right, improve=0.2357242, (0 missing)
## Surrogate splits:
## lstat < 4.6 to the right, agree=0.925, adj=0.387, (0 split)
## indus < 1.605 to the right, agree=0.893, adj=0.129, (0 split)
## ptratio < 14.15 to the right, agree=0.893, adj=0.129, (0 split)
## zn < 86.25 to the left, agree=0.889, adj=0.097, (0 split)
## crim < 0.01958 to the right, agree=0.885, adj=0.065, (0 split)
##
## Node number 2: 222 observations, complexity param=0.1761005
## mean=19.3536, MSE=30.60492
## left son=4 (87 obs) right son=5 (135 obs)
## Primary splits:
## lstat < 14.405 to the right, improve=0.5040674, (0 missing)
## crim < 6.108565 to the right, improve=0.3586261, (0 missing)
## dis < 2.23995 to the left, improve=0.3565564, (0 missing)
## age < 93.1 to the right, improve=0.3387831, (0 missing)
## nox < 0.5835 to the right, improve=0.3244859, (0 missing)
## Surrogate splits:
## age < 88.1 to the right, agree=0.829, adj=0.563, (0 split)
## dis < 2.23995 to the left, agree=0.815, adj=0.529, (0 split)
## nox < 0.5835 to the right, agree=0.779, adj=0.437, (0 split)
## tax < 431 to the right, agree=0.775, adj=0.425, (0 split)
## crim < 5.24741 to the right, agree=0.770, adj=0.414, (0 split)
##
## Node number 3: 31 observations, complexity param=0.05689532
## mean=39.20968, MSE=62.22539
## left son=6 (16 obs) right son=7 (15 obs)
## Primary splits:
## rm < 7.553 to the left, improve=0.5736135, (0 missing)
## lstat < 4.52 to the right, improve=0.5430443, (0 missing)
## dis < 3.39285 to the right, improve=0.2565775, (0 missing)
## crim < 0.260035 to the left, improve=0.2074717, (0 missing)
## nox < 0.4965 to the left, improve=0.1890524, (0 missing)
## Surrogate splits:
## lstat < 4.52 to the right, agree=0.806, adj=0.600, (0 split)
## crim < 0.11276 to the left, agree=0.742, adj=0.467, (0 split)
## zn < 27.5 to the right, agree=0.742, adj=0.467, (0 split)
## dis < 4.74095 to the right, agree=0.710, adj=0.400, (0 split)
## nox < 0.48 to the left, agree=0.677, adj=0.333, (0 split)
##
## Node number 4: 87 observations, complexity param=0.03276814
## mean=14.46092, MSE=17.85962
## left son=8 (26 obs) right son=9 (61 obs)
## Primary splits:
## crim < 11.48635 to the right, improve=0.4101403, (0 missing)
## lstat < 19.645 to the right, improve=0.3149683, (0 missing)
## nox < 0.6615 to the right, improve=0.2835297, (0 missing)
## tax < 551.5 to the right, improve=0.2799295, (0 missing)
## dis < 2.0037 to the left, improve=0.2601665, (0 missing)
## Surrogate splits:
## age < 99 to the right, agree=0.805, adj=0.346, (0 split)
## dis < 1.66345 to the left, agree=0.793, adj=0.308, (0 split)
## black < 221.785 to the left, agree=0.782, adj=0.269, (0 split)
## rm < 5.3695 to the left, agree=0.759, adj=0.192, (0 split)
## lstat < 30.06 to the right, agree=0.747, adj=0.154, (0 split)
##
## Node number 5: 135 observations, complexity param=0.04093613
## mean=22.50667, MSE=13.44981
## left son=10 (111 obs) right son=11 (24 obs)
## Primary splits:
## rm < 6.543 to the left, improve=0.4384591, (0 missing)
## lstat < 7.76 to the right, improve=0.3773263, (0 missing)
## nox < 0.5125 to the right, improve=0.1511574, (0 missing)
## age < 33.8 to the right, improve=0.1421214, (0 missing)
## ptratio < 18.65 to the right, improve=0.1184029, (0 missing)
## Surrogate splits:
## lstat < 5.055 to the right, agree=0.874, adj=0.292, (0 split)
## crim < 0.02902 to the right, agree=0.830, adj=0.042, (0 split)
##
## Node number 6: 16 observations
## mean=33.425, MSE=31.59312
##
## Node number 7: 15 observations
## mean=45.38, MSE=21.1336
##
## Node number 8: 26 observations
## mean=10.31538, MSE=11.64284
##
## Node number 9: 61 observations, complexity param=0.01048773
## mean=16.22787, MSE=10.06234
## left son=18 (31 obs) right son=19 (30 obs)
## Primary splits:
## age < 93.95 to the right, improve=0.3322959, (0 missing)
## lstat < 18.825 to the right, improve=0.2782521, (0 missing)
## crim < 0.711085 to the right, improve=0.2045230, (0 missing)
## ptratio < 19.45 to the right, improve=0.1866394, (0 missing)
## black < 344.48 to the left, improve=0.1796701, (0 missing)
## Surrogate splits:
## dis < 2.2085 to the left, agree=0.787, adj=0.567, (0 split)
## indus < 16.01 to the right, agree=0.721, adj=0.433, (0 split)
## nox < 0.597 to the right, agree=0.721, adj=0.433, (0 split)
## lstat < 17.995 to the right, agree=0.721, adj=0.433, (0 split)
## crim < 0.308165 to the right, agree=0.705, adj=0.400, (0 split)
##
## Node number 10: 111 observations
## mean=21.37748, MSE=6.875078
##
## Node number 11: 24 observations
## mean=27.72917, MSE=10.68623
##
## Node number 18: 31 observations
## mean=14.42903, MSE=5.294318
##
## Node number 19: 30 observations
## mean=18.08667, MSE=8.190489
# Draw the fitted tree on a single-panel device
par(mfrow = c(1, 1))
plot(
  x = tree.boston,
  main = "Regression Tree for Median Home Value ",
  margin = 0.2,
  uniform = TRUE
)
# Add the text labels to the tree drawn above
text(x = tree.boston, cex = 0.8, all = TRUE, use.n = TRUE)
# Use cross-validation to select the complexity:
# print the CP table with its cross-validated error (xerror) columns
printcp(x = tree.boston)
##
## Regression tree:
## rpart(formula = medv ~ ., data = Boston, subset = train, method = "anova")
##
## Variables actually used in tree construction:
## [1] age crim lstat rm
##
## Root node error: 19448/253 = 76.869
##
## n= 253
##
## CP nsplit rel error xerror xstd
## 1 0.551453 0 1.00000 1.00246 0.126271
## 2 0.176101 1 0.44855 0.48456 0.042763
## 3 0.056895 2 0.27245 0.30320 0.030609
## 4 0.040936 3 0.21555 0.27604 0.029893
## 5 0.032768 4 0.17461 0.23875 0.028484
## 6 0.010488 5 0.14185 0.21692 0.027290
## 7 0.010000 6 0.13136 0.21777 0.027336
# Visualize the cross-validation results
plotcp(tree.boston)
# create additional plots
# rsq.rpart() draws two plots, so switch to a 1 x 2 layout just for this call.
# par() returns the previous settings; restoring them afterwards keeps every
# later plot in the script from being squeezed into half of the device.
old_par <- par(mfrow = c(1, 2))
rsq.rpart(tree.boston)
par(old_par)
##
## Regression tree:
## rpart(formula = medv ~ ., data = Boston, subset = train, method = "anova")
##
## Variables actually used in tree construction:
## [1] age crim lstat rm
##
## Root node error: 19448/253 = 76.869
##
## n= 253
##
## CP nsplit rel error xerror xstd
## 1 0.551453 0 1.00000 1.00246 0.126271
## 2 0.176101 1 0.44855 0.48456 0.042763
## 3 0.056895 2 0.27245 0.30320 0.030609
## 4 0.040936 3 0.21555 0.27604 0.029893
## 5 0.032768 4 0.17461 0.23875 0.028484
## 6 0.010488 5 0.14185 0.21692 0.027290
## 7 0.010000 6 0.13136 0.21777 0.027336
# Locate the CP-table row with the smallest cross-validated error (xerror)
cp_table <- tree.boston[["cptable"]]
cp_table
## CP nsplit rel error xerror xstd
## 1 0.55145333 0 1.0000000 1.0024582 0.12627136
## 2 0.17610053 1 0.4485467 0.4845589 0.04276338
## 3 0.05689532 2 0.2724461 0.3031982 0.03060864
## 4 0.04093613 3 0.2155508 0.2760398 0.02989290
## 5 0.03276814 4 0.1746147 0.2387489 0.02848397
## 6 0.01048773 5 0.1418465 0.2169243 0.02729011
## 7 0.01000000 6 0.1313588 0.2177695 0.02733643
# index of the row that minimizes xerror
best_row <- which.min(cp_table[, "xerror"])
best_row
## 6
## 6
# the CP value stored in that row
cp_table[best_row, "CP"]
## [1] 0.01048773
# Prune at cp = 0.01. This is the CP of the last cptable row (nsplit = 6),
# so the result still has all six splits.
pfit <- prune(tree = tree.boston, cp = 0.01)
# Draw and label the resulting tree
plot(
  x = pfit,
  main = "Pruned Regression Tree for Nsplit=6",
  margin = 0.2,
  uniform = TRUE
)
text(x = pfit, cex = 0.8, all = TRUE, use.n = TRUE)
Well, that really wasn’t pruning… so let's try again.
# what about 5 nodes
# Prune back to 5 terminal nodes, i.e. the subtree with 4 splits.
# The CP value is looked up in the fitted cptable instead of retyping the
# rounded number from the printout: prune() compares cp with the unrounded
# node complexities, so a rounded literal can fall on either side of the
# threshold and keep one split too many.
cp_rows <- tree.boston$cptable
cp5 <- unname(cp_rows[cp_rows[, "nsplit"] == 4, "CP"])
# fail loudly if the fitted tree has no 4-split subtree in its CP table
stopifnot(length(cp5) == 1L)
pfit5 <- prune(tree.boston, cp = cp5)
# plot the pruned tree for 5 nodes
plot(pfit5, uniform = TRUE, margin = 0.2,
     main = "Pruned Regression Tree for 5 Nodes")
text(pfit5, use.n = TRUE, all = TRUE, cex = 0.8)
# Evaluate the unpruned tree (tree.boston) on the rows not used for training
boston.test <- Boston[-train, "medv"]
yhat <- predict(object = tree.boston, newdata = Boston[-train, ])
# Predicted vs. observed medv, with the y = x reference line
plot(yhat, boston.test)
abline(a = 0, b = 1)
# Test-set mean squared error
mean((boston.test - yhat)^2)
## [1] 35.28688
We will be using the Carseats data from the book and creating a new variable to separate out high sales.
# CLASSIFICATION TREE
# Do NOT use attach(); refer to the columns explicitly (Carseats$Sales)
library(ISLR)
data(Carseats)
# Binary response: "No" when Sales is below 8, "Yes" otherwise.
# It is stored as a factor explicitly: since R 4.0 data.frame() no longer
# converts strings to factors, so without factor() High would end up as a
# character column on current R versions.
High <- factor(ifelse(Carseats$Sales < 8, "No", "Yes"), levels = c("No", "Yes"))
# Save under a new name so the original Carseats data stays unchanged
Carseats2 <- data.frame(Carseats, High)
First we want to partition our data into a training and testing set.
# Reproducible train/test split of the Carseats rows
set.seed(2)
dim(Carseats)
## [1] 400 11
# Randomly pick 200 of the row indices (half of the data) for training
train <- sample(seq_len(nrow(Carseats2)), size = 200)
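This grows a very large tree: with minsplit = 1, rpart will try to split a node no matter how few observations it contains. (The default cp = 0.01 still applies, so this is not quite the full tree.)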
# Classification tree (method = "class") for High on every column except
# Sales, fitted to the training rows with minsplit lowered to 1
tree.carseats <- rpart(
  formula = High ~ . - Sales,
  data = Carseats2,
  method = "class",
  control = rpart.control(minsplit = 1),
  subset = train
)
# Detailed report of the fitted tree
summary(tree.carseats)
## Call:
## rpart(formula = High ~ . - Sales, data = Carseats2, subset = train,
## method = "class", control = rpart.control(minsplit = 1))
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.22222222 0 1.00000000 1.0000000 0.08570694
## 2 0.11111111 1 0.77777778 0.9753086 0.08535053
## 3 0.04938272 2 0.66666667 0.9135802 0.08429486
## 4 0.03703704 5 0.50617284 0.9012346 0.08405495
## 5 0.02469136 6 0.46913580 0.9259259 0.08452509
## 6 0.01851852 11 0.34567901 0.9012346 0.08405495
## 7 0.01234568 15 0.27160494 0.8888889 0.08380525
## 8 0.01000000 30 0.08641975 0.9135802 0.08429486
##
## Variable importance
## Price CompPrice Population Age Income Advertising
## 21 16 16 12 12 10
## ShelveLoc Education US Urban
## 7 5 1 1
##
## Node number 1: 200 observations, complexity param=0.2222222
## predicted class=No expected loss=0.405 P(node) =1
## class counts: 119 81
## probabilities: 0.595 0.405
## left son=2 (160 obs) right son=3 (40 obs)
## Primary splits:
## Price < 96.5 to the right, improve=10.240000, (0 missing)
## ShelveLoc splits as LRL, improve= 9.825674, (0 missing)
## Advertising < 7.5 to the left, improve= 7.885091, (0 missing)
## Age < 60.5 to the right, improve= 5.520667, (0 missing)
## Income < 61.5 to the left, improve= 5.415000, (0 missing)
## Surrogate splits:
## CompPrice < 101 to the right, agree=0.835, adj=0.175, (0 split)
## Population < 502 to the left, agree=0.805, adj=0.025, (0 split)
##
## Node number 2: 160 observations, complexity param=0.1111111
## predicted class=No expected loss=0.325 P(node) =0.8
## class counts: 108 52
## probabilities: 0.675 0.325
## left son=4 (135 obs) right son=5 (25 obs)
## Primary splits:
## ShelveLoc splits as LRL, improve=7.468148, (0 missing)
## Advertising < 6.5 to the left, improve=6.050000, (0 missing)
## Income < 61.5 to the left, improve=4.907110, (0 missing)
## Price < 137 to the right, improve=4.250179, (0 missing)
## Age < 60.5 to the right, improve=3.498223, (0 missing)
##
## Node number 3: 40 observations, complexity param=0.02469136
## predicted class=Yes expected loss=0.275 P(node) =0.2
## class counts: 11 29
## probabilities: 0.275 0.725
## left son=6 (11 obs) right son=7 (29 obs)
## Primary splits:
## Age < 64.5 to the right, improve=2.219592, (0 missing)
## Advertising < 7 to the left, improve=1.758081, (0 missing)
## Population < 414 to the left, improve=1.756452, (0 missing)
## ShelveLoc splits as LRL, improve=1.512500, (0 missing)
## US splits as LR, improve=1.106010, (0 missing)
##
## Node number 4: 135 observations, complexity param=0.04938272
## predicted class=No expected loss=0.2592593 P(node) =0.675
## class counts: 100 35
## probabilities: 0.741 0.259
## left son=8 (53 obs) right son=9 (82 obs)
## Primary splits:
## Price < 124.5 to the right, improve=4.746468, (0 missing)
## Advertising < 7.5 to the left, improve=4.561486, (0 missing)
## CompPrice < 143.5 to the left, improve=3.383383, (0 missing)
## Population < 498 to the left, improve=3.367003, (0 missing)
## Age < 49.5 to the right, improve=3.325185, (0 missing)
## Surrogate splits:
## CompPrice < 146 to the right, agree=0.674, adj=0.170, (0 split)
## Population < 478.5 to the right, agree=0.630, adj=0.057, (0 split)
## Advertising < 24 to the right, agree=0.622, adj=0.038, (0 split)
## Age < 28.5 to the left, agree=0.622, adj=0.038, (0 split)
## Education < 17.5 to the right, agree=0.622, adj=0.038, (0 split)
##
## Node number 5: 25 observations, complexity param=0.03703704
## predicted class=Yes expected loss=0.32 P(node) =0.125
## class counts: 8 17
## probabilities: 0.320 0.680
## left son=10 (3 obs) right son=11 (22 obs)
## Primary splits:
## Price < 157 to the right, improve=3.152727, (0 missing)
## Income < 43 to the left, improve=3.022857, (0 missing)
## Age < 68.5 to the right, improve=2.188824, (0 missing)
## US splits as LR, improve=2.188824, (0 missing)
## Advertising < 6 to the left, improve=1.996883, (0 missing)
##
## Node number 6: 11 observations, complexity param=0.02469136
## predicted class=No expected loss=0.4545455 P(node) =0.055
## class counts: 6 5
## probabilities: 0.545 0.455
## left son=12 (8 obs) right son=13 (3 obs)
## Primary splits:
## Population < 339 to the left, improve=2.454545, (0 missing)
## CompPrice < 117 to the right, improve=1.704545, (0 missing)
## Price < 90.5 to the right, improve=1.704545, (0 missing)
## Advertising < 8 to the left, improve=1.187879, (0 missing)
## Income < 68.5 to the right, improve=1.097403, (0 missing)
## Surrogate splits:
## Income < 91.5 to the left, agree=0.818, adj=0.333, (0 split)
##
## Node number 7: 29 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.1724138 P(node) =0.145
## class counts: 5 24
## probabilities: 0.172 0.828
## left son=14 (3 obs) right son=15 (26 obs)
## Primary splits:
## Income < 26.5 to the left, improve=1.6348360, (0 missing)
## ShelveLoc splits as LRR, improve=1.6236880, (0 missing)
## Advertising < 2 to the left, improve=1.6091950, (0 missing)
## Education < 13.5 to the right, improve=1.4008620, (0 missing)
## US splits as LR, improve=0.8624005, (0 missing)
##
## Node number 8: 53 observations, complexity param=0.02469136
## predicted class=No expected loss=0.09433962 P(node) =0.265
## class counts: 48 5
## probabilities: 0.906 0.094
## left son=16 (51 obs) right son=17 (2 obs)
## Primary splits:
## Advertising < 23.5 to the left, improve=3.409545, (0 missing)
## CompPrice < 148.5 to the left, improve=2.013314, (0 missing)
## Population < 393.5 to the left, improve=1.688183, (0 missing)
## Income < 30 to the right, improve=1.031604, (0 missing)
## ShelveLoc splits as L-R, improve=0.527192, (0 missing)
##
## Node number 9: 82 observations, complexity param=0.04938272
## predicted class=No expected loss=0.3658537 P(node) =0.41
## class counts: 52 30
## probabilities: 0.634 0.366
## left son=18 (48 obs) right son=19 (34 obs)
## Primary splits:
## Age < 49.5 to the right, improve=7.364957, (0 missing)
## CompPrice < 129.5 to the left, improve=5.827570, (0 missing)
## Advertising < 10.5 to the left, improve=5.601642, (0 missing)
## Income < 51.5 to the left, improve=2.627905, (0 missing)
## Population < 440.5 to the left, improve=1.859280, (0 missing)
## Surrogate splits:
## Price < 118.5 to the left, agree=0.646, adj=0.147, (0 split)
## Income < 75.5 to the left, agree=0.634, adj=0.118, (0 split)
## CompPrice < 143 to the left, agree=0.622, adj=0.088, (0 split)
## Advertising < 10.5 to the left, agree=0.622, adj=0.088, (0 split)
## Population < 496 to the left, agree=0.610, adj=0.059, (0 split)
##
## Node number 10: 3 observations
## predicted class=No expected loss=0 P(node) =0.015
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 11: 22 observations, complexity param=0.02469136
## predicted class=Yes expected loss=0.2272727 P(node) =0.11
## class counts: 5 17
## probabilities: 0.227 0.773
## left son=22 (4 obs) right son=23 (18 obs)
## Primary splits:
## Income < 35.5 to the left, improve=2.671717, (0 missing)
## US splits as LR, improve=2.432035, (0 missing)
## Advertising < 1 to the left, improve=1.870130, (0 missing)
## Education < 14.5 to the left, improve=1.573427, (0 missing)
## Age < 61.5 to the right, improve=1.436674, (0 missing)
##
## Node number 12: 8 observations, complexity param=0.01234568
## predicted class=No expected loss=0.25 P(node) =0.04
## class counts: 6 2
## probabilities: 0.750 0.250
## left son=24 (5 obs) right son=25 (3 obs)
## Primary splits:
## Income < 68.5 to the right, improve=1.666667, (0 missing)
## Population < 30.5 to the right, improve=1.285714, (0 missing)
## ShelveLoc splits as LRL, improve=1.285714, (0 missing)
## Advertising < 8 to the left, improve=1.000000, (0 missing)
## Price < 89.5 to the right, improve=1.000000, (0 missing)
## Surrogate splits:
## Advertising < 8 to the left, agree=0.875, adj=0.667, (0 split)
## Age < 74 to the left, agree=0.875, adj=0.667, (0 split)
## CompPrice < 102 to the left, agree=0.750, adj=0.333, (0 split)
## Price < 90.5 to the right, agree=0.750, adj=0.333, (0 split)
## Education < 14 to the left, agree=0.750, adj=0.333, (0 split)
##
## Node number 13: 3 observations
## predicted class=Yes expected loss=0 P(node) =0.015
## class counts: 0 3
## probabilities: 0.000 1.000
##
## Node number 14: 3 observations, complexity param=0.01234568
## predicted class=No expected loss=0.3333333 P(node) =0.015
## class counts: 2 1
## probabilities: 0.667 0.333
## left son=28 (2 obs) right son=29 (1 obs)
## Primary splits:
## CompPrice < 113.5 to the left, improve=1.333333, (0 missing)
## Population < 346 to the right, improve=1.333333, (0 missing)
## ShelveLoc splits as LRL, improve=1.333333, (0 missing)
## Age < 31 to the right, improve=1.333333, (0 missing)
## Education < 12 to the right, improve=1.333333, (0 missing)
##
## Node number 15: 26 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.1153846 P(node) =0.13
## class counts: 3 23
## probabilities: 0.115 0.885
## left son=30 (5 obs) right son=31 (21 obs)
## Primary splits:
## ShelveLoc splits as LRR, improve=1.0029300, (0 missing)
## Advertising < 2 to the left, improve=0.8076923, (0 missing)
## Population < 207.5 to the left, improve=0.8076923, (0 missing)
## Education < 15.5 to the right, improve=0.8076923, (0 missing)
## CompPrice < 139 to the right, improve=0.6410256, (0 missing)
## Surrogate splits:
## Age < 43.5 to the left, agree=0.846, adj=0.2, (0 split)
##
## Node number 16: 51 observations, complexity param=0.01234568
## predicted class=No expected loss=0.05882353 P(node) =0.255
## class counts: 48 3
## probabilities: 0.941 0.059
## left son=32 (50 obs) right son=33 (1 obs)
## Primary splits:
## Population < 498 to the left, improve=1.8070590, (0 missing)
## CompPrice < 148.5 to the left, improve=1.4470590, (0 missing)
## Income < 29.5 to the right, improve=0.3172716, (0 missing)
## Education < 10.5 to the right, improve=0.3172716, (0 missing)
## Age < 47.5 to the left, improve=0.2899160, (0 missing)
##
## Node number 17: 2 observations
## predicted class=Yes expected loss=0 P(node) =0.01
## class counts: 0 2
## probabilities: 0.000 1.000
##
## Node number 18: 48 observations, complexity param=0.01851852
## predicted class=No expected loss=0.1875 P(node) =0.24
## class counts: 39 9
## probabilities: 0.813 0.188
## left son=36 (28 obs) right son=37 (20 obs)
## Primary splits:
## CompPrice < 124.5 to the left, improve=1.8107140, (0 missing)
## Advertising < 6.5 to the left, improve=1.0351630, (0 missing)
## Population < 442.5 to the left, improve=0.9525261, (0 missing)
## Price < 118.5 to the right, improve=0.6750000, (0 missing)
## Education < 10.5 to the right, improve=0.6750000, (0 missing)
## Surrogate splits:
## Price < 111.5 to the left, agree=0.729, adj=0.35, (0 split)
## Age < 60.5 to the right, agree=0.646, adj=0.15, (0 split)
## Population < 38 to the right, agree=0.625, adj=0.10, (0 split)
## US splits as RL, agree=0.625, adj=0.10, (0 split)
## Income < 88 to the left, agree=0.604, adj=0.05, (0 split)
##
## Node number 19: 34 observations, complexity param=0.04938272
## predicted class=Yes expected loss=0.3823529 P(node) =0.17
## class counts: 13 21
## probabilities: 0.382 0.618
## left son=38 (7 obs) right son=39 (27 obs)
## Primary splits:
## CompPrice < 115.5 to the left, improve=3.974167, (0 missing)
## Population < 125 to the left, improve=3.827712, (0 missing)
## Advertising < 10.5 to the left, improve=3.328999, (0 missing)
## US splits as LR, improve=2.285930, (0 missing)
## Income < 95.5 to the left, improve=1.801046, (0 missing)
## Surrogate splits:
## Income < 34.5 to the left, agree=0.853, adj=0.286, (0 split)
## Population < 39 to the left, agree=0.824, adj=0.143, (0 split)
##
## Node number 22: 4 observations, complexity param=0.01234568
## predicted class=No expected loss=0.25 P(node) =0.02
## class counts: 3 1
## probabilities: 0.750 0.250
## left son=44 (3 obs) right son=45 (1 obs)
## Primary splits:
## Income < 22.5 to the right, improve=1.5, (0 missing)
## Advertising < 13 to the left, improve=1.5, (0 missing)
## Population < 410.5 to the left, improve=1.5, (0 missing)
## Age < 46.5 to the right, improve=1.5, (0 missing)
## CompPrice < 121.5 to the left, improve=0.5, (0 missing)
##
## Node number 23: 18 observations
## predicted class=Yes expected loss=0.1111111 P(node) =0.09
## class counts: 2 16
## probabilities: 0.111 0.889
##
## Node number 24: 5 observations
## predicted class=No expected loss=0 P(node) =0.025
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 25: 3 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.3333333 P(node) =0.015
## class counts: 1 2
## probabilities: 0.333 0.667
## left son=50 (1 obs) right son=51 (2 obs)
## Primary splits:
## CompPrice < 116.5 to the right, improve=1.333333, (0 missing)
## Income < 60 to the left, improve=1.333333, (0 missing)
## Advertising < 18 to the right, improve=1.333333, (0 missing)
## Price < 89.5 to the right, improve=1.333333, (0 missing)
## ShelveLoc splits as LRR, improve=1.333333, (0 missing)
##
## Node number 28: 2 observations
## predicted class=No expected loss=0 P(node) =0.01
## class counts: 2 0
## probabilities: 1.000 0.000
##
## Node number 29: 1 observations
## predicted class=Yes expected loss=0 P(node) =0.005
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 30: 5 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.4 P(node) =0.025
## class counts: 2 3
## probabilities: 0.400 0.600
## left son=60 (2 obs) right son=61 (3 obs)
## Primary splits:
## Population < 120 to the left, improve=2.400000, (0 missing)
## CompPrice < 116.5 to the right, improve=1.066667, (0 missing)
## Income < 83.5 to the left, improve=1.066667, (0 missing)
## Advertising < 2.5 to the left, improve=1.066667, (0 missing)
## Education < 14.5 to the right, improve=1.066667, (0 missing)
## Surrogate splits:
## CompPrice < 116.5 to the right, agree=0.8, adj=0.5, (0 split)
## Income < 83.5 to the left, agree=0.8, adj=0.5, (0 split)
## Advertising < 2.5 to the left, agree=0.8, adj=0.5, (0 split)
## Education < 14.5 to the right, agree=0.8, adj=0.5, (0 split)
##
## Node number 31: 21 observations
## predicted class=Yes expected loss=0.04761905 P(node) =0.105
## class counts: 1 20
## probabilities: 0.048 0.952
##
## Node number 32: 50 observations
## predicted class=No expected loss=0.04 P(node) =0.25
## class counts: 48 2
## probabilities: 0.960 0.040
##
## Node number 33: 1 observations
## predicted class=Yes expected loss=0 P(node) =0.005
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 36: 28 observations
## predicted class=No expected loss=0.07142857 P(node) =0.14
## class counts: 26 2
## probabilities: 0.929 0.071
##
## Node number 37: 20 observations, complexity param=0.01851852
## predicted class=No expected loss=0.35 P(node) =0.1
## class counts: 13 7
## probabilities: 0.650 0.350
## left son=74 (13 obs) right son=75 (7 obs)
## Primary splits:
## Advertising < 5 to the left, improve=2.8582420, (0 missing)
## Price < 115.5 to the right, improve=2.5000000, (0 missing)
## Population < 442.5 to the left, improve=1.8777780, (0 missing)
## ShelveLoc splits as L-R, improve=1.6333330, (0 missing)
## Age < 59 to the left, improve=0.9241758, (0 missing)
## Surrogate splits:
## US splits as LR, agree=0.85, adj=0.571, (0 split)
## Population < 413 to the left, agree=0.75, adj=0.286, (0 split)
## Education < 10.5 to the right, agree=0.75, adj=0.286, (0 split)
## Urban splits as RL, agree=0.75, adj=0.286, (0 split)
## CompPrice < 131 to the right, agree=0.70, adj=0.143, (0 split)
##
## Node number 38: 7 observations, complexity param=0.01234568
## predicted class=No expected loss=0.1428571 P(node) =0.035
## class counts: 6 1
## probabilities: 0.857 0.143
## left son=76 (6 obs) right son=77 (1 obs)
## Primary splits:
## Income < 102.5 to the left, improve=1.7142860, (0 missing)
## Population < 392 to the left, improve=1.7142860, (0 missing)
## Advertising < 12 to the left, improve=0.7142857, (0 missing)
## Price < 105 to the right, improve=0.7142857, (0 missing)
## Age < 39.5 to the left, improve=0.7142857, (0 missing)
##
## Node number 39: 27 observations, complexity param=0.02469136
## predicted class=Yes expected loss=0.2592593 P(node) =0.135
## class counts: 7 20
## probabilities: 0.259 0.741
## left son=78 (6 obs) right son=79 (21 obs)
## Primary splits:
## Population < 110 to the left, improve=2.560847, (0 missing)
## Advertising < 8.5 to the left, improve=2.051689, (0 missing)
## CompPrice < 130.5 to the left, improve=1.667074, (0 missing)
## Price < 119.5 to the left, improve=1.270370, (0 missing)
## Age < 33 to the right, improve=1.037037, (0 missing)
## Surrogate splits:
## Education < 15.5 to the right, agree=0.852, adj=0.333, (0 split)
##
## Node number 44: 3 observations
## predicted class=No expected loss=0 P(node) =0.015
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 45: 1 observations
## predicted class=Yes expected loss=0 P(node) =0.005
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 50: 1 observations
## predicted class=No expected loss=0 P(node) =0.005
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 51: 2 observations
## predicted class=Yes expected loss=0 P(node) =0.01
## class counts: 0 2
## probabilities: 0.000 1.000
##
## Node number 60: 2 observations
## predicted class=No expected loss=0 P(node) =0.01
## class counts: 2 0
## probabilities: 1.000 0.000
##
## Node number 61: 3 observations
## predicted class=Yes expected loss=0 P(node) =0.015
## class counts: 0 3
## probabilities: 0.000 1.000
##
## Node number 74: 13 observations, complexity param=0.01234568
## predicted class=No expected loss=0.1538462 P(node) =0.065
## class counts: 11 2
## probabilities: 0.846 0.154
## left son=148 (8 obs) right son=149 (5 obs)
## Primary splits:
## Price < 112 to the right, improve=0.9846154, (0 missing)
## Population < 106 to the right, improve=0.7179487, (0 missing)
## CompPrice < 133.5 to the left, improve=0.5274725, (0 missing)
## Age < 60.5 to the right, improve=0.5274725, (0 missing)
## ShelveLoc splits as L-R, improve=0.3846154, (0 missing)
## Surrogate splits:
## CompPrice < 127.5 to the right, agree=0.769, adj=0.4, (0 split)
## Income < 95 to the left, agree=0.769, adj=0.4, (0 split)
## Population < 208.5 to the left, agree=0.692, adj=0.2, (0 split)
##
## Node number 75: 7 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.2857143 P(node) =0.035
## class counts: 2 5
## probabilities: 0.286 0.714
## left son=150 (1 obs) right son=151 (6 obs)
## Primary splits:
## Income < 101 to the right, improve=1.1904760, (0 missing)
## Age < 55 to the left, improve=1.1904760, (0 missing)
## Education < 15 to the right, improve=1.1904760, (0 missing)
## Urban splits as LR, improve=0.8571429, (0 missing)
## CompPrice < 128.5 to the right, improve=0.4571429, (0 missing)
##
## Node number 76: 6 observations
## predicted class=No expected loss=0 P(node) =0.03
## class counts: 6 0
## probabilities: 1.000 0.000
##
## Node number 77: 1 observations
## predicted class=Yes expected loss=0 P(node) =0.005
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 78: 6 observations, complexity param=0.01234568
## predicted class=No expected loss=0.3333333 P(node) =0.03
## class counts: 4 2
## probabilities: 0.667 0.333
## left son=156 (3 obs) right son=157 (3 obs)
## Primary splits:
## CompPrice < 131.5 to the left, improve=1.333333, (0 missing)
## Income < 95 to the left, improve=1.066667, (0 missing)
## Advertising < 6.5 to the left, improve=1.066667, (0 missing)
## Population < 33.5 to the right, improve=1.066667, (0 missing)
## Age < 33 to the right, improve=1.066667, (0 missing)
## Surrogate splits:
## Advertising < 1 to the left, agree=0.833, adj=0.667, (0 split)
## Price < 111.5 to the left, agree=0.833, adj=0.667, (0 split)
## Income < 78.5 to the left, agree=0.667, adj=0.333, (0 split)
## Population < 66.5 to the left, agree=0.667, adj=0.333, (0 split)
## ShelveLoc splits as L-R, agree=0.667, adj=0.333, (0 split)
##
## Node number 79: 21 observations, complexity param=0.01851852
## predicted class=Yes expected loss=0.1428571 P(node) =0.105
## class counts: 3 18
## probabilities: 0.143 0.857
## left son=158 (7 obs) right son=159 (14 obs)
## Primary splits:
## Population < 385.5 to the right, improve=1.7142860, (0 missing)
## CompPrice < 129.5 to the left, improve=0.9428571, (0 missing)
## ShelveLoc splits as L-R, improve=0.8678571, (0 missing)
## Education < 10.5 to the left, improve=0.8678571, (0 missing)
## Income < 41.5 to the left, improve=0.5639098, (0 missing)
## Surrogate splits:
## Advertising < 18.5 to the right, agree=0.81, adj=0.429, (0 split)
##
## Node number 148: 8 observations
## predicted class=No expected loss=0 P(node) =0.04
## class counts: 8 0
## probabilities: 1.000 0.000
##
## Node number 149: 5 observations, complexity param=0.01234568
## predicted class=No expected loss=0.4 P(node) =0.025
## class counts: 3 2
## probabilities: 0.600 0.400
## left son=298 (3 obs) right son=299 (2 obs)
## Primary splits:
## CompPrice < 133 to the left, improve=2.400000, (0 missing)
## Population < 184.5 to the right, improve=2.400000, (0 missing)
## Price < 101.5 to the left, improve=1.066667, (0 missing)
## ShelveLoc splits as L-R, improve=1.066667, (0 missing)
## Age < 61 to the right, improve=1.066667, (0 missing)
## Surrogate splits:
## Population < 184.5 to the right, agree=1.0, adj=1.0, (0 split)
## Price < 101.5 to the left, agree=0.8, adj=0.5, (0 split)
## Age < 61 to the right, agree=0.8, adj=0.5, (0 split)
##
## Node number 150: 1 observations
## predicted class=No expected loss=0 P(node) =0.005
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 151: 6 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.1666667 P(node) =0.03
## class counts: 1 5
## probabilities: 0.167 0.833
## left son=302 (1 obs) right son=303 (5 obs)
## Primary splits:
## Age < 55 to the left, improve=1.6666670, (0 missing)
## Education < 15 to the right, improve=1.6666670, (0 missing)
## Advertising < 10.5 to the left, improve=0.6666667, (0 missing)
## CompPrice < 129.5 to the left, improve=0.3333333, (0 missing)
## Income < 65.5 to the right, improve=0.3333333, (0 missing)
##
## Node number 156: 3 observations
## predicted class=No expected loss=0 P(node) =0.015
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 157: 3 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.3333333 P(node) =0.015
## class counts: 1 2
## probabilities: 0.333 0.667
## left son=314 (1 obs) right son=315 (2 obs)
## Primary splits:
## Price < 118 to the right, improve=1.3333330, (0 missing)
## Education < 13.5 to the left, improve=1.3333330, (0 missing)
## CompPrice < 134 to the right, improve=0.3333333, (0 missing)
## Income < 78 to the right, improve=0.3333333, (0 missing)
## Advertising < 1 to the right, improve=0.3333333, (0 missing)
##
## Node number 158: 7 observations, complexity param=0.01851852
## predicted class=Yes expected loss=0.4285714 P(node) =0.035
## class counts: 3 4
## probabilities: 0.429 0.571
## left son=316 (3 obs) right son=317 (4 obs)
## Primary splits:
## Population < 435 to the left, improve=3.428571, (0 missing)
## CompPrice < 132 to the left, improve=1.928571, (0 missing)
## ShelveLoc splits as L-R, improve=1.828571, (0 missing)
## Education < 10.5 to the left, improve=1.828571, (0 missing)
## Price < 119.5 to the left, improve=1.028571, (0 missing)
## Surrogate splits:
## CompPrice < 132 to the left, agree=0.857, adj=0.667, (0 split)
## Education < 10.5 to the left, agree=0.857, adj=0.667, (0 split)
## Advertising < 11 to the left, agree=0.714, adj=0.333, (0 split)
## Price < 118 to the left, agree=0.714, adj=0.333, (0 split)
## Age < 41.5 to the right, agree=0.714, adj=0.333, (0 split)
##
## Node number 159: 14 observations
## predicted class=Yes expected loss=0 P(node) =0.07
## class counts: 0 14
## probabilities: 0.000 1.000
##
## Node number 298: 3 observations
## predicted class=No expected loss=0 P(node) =0.015
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 299: 2 observations
## predicted class=Yes expected loss=0 P(node) =0.01
## class counts: 0 2
## probabilities: 0.000 1.000
##
## Node number 302: 1 observations
## predicted class=No expected loss=0 P(node) =0.005
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 303: 5 observations
## predicted class=Yes expected loss=0 P(node) =0.025
## class counts: 0 5
## probabilities: 0.000 1.000
##
## Node number 314: 1 observations
## predicted class=No expected loss=0 P(node) =0.005
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 315: 2 observations
## predicted class=Yes expected loss=0 P(node) =0.01
## class counts: 0 2
## probabilities: 0.000 1.000
##
## Node number 316: 3 observations
## predicted class=No expected loss=0 P(node) =0.015
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 317: 4 observations
## predicted class=Yes expected loss=0 P(node) =0.02
## class counts: 0 4
## probabilities: 0.000 1.000
# draw the unpruned classification tree on a single-panel page
par(mfrow = c(1, 1))
plot(
  tree.carseats,
  uniform = TRUE,
  margin = 0.2,
  main = "Classification Tree for Carseat Sales"
)
# label every node (all = TRUE) and show the observation counts (use.n = TRUE)
text(tree.carseats, use.n = TRUE, all = TRUE, cex = 0.8)
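We can change this so that a node must contain at least 10 observations before a split is attempted (minsplit = 10). Note that this is not a minimum leaf size: terminal nodes can still be smaller, because minbucket defaults to round(minsplit/3).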
# refit the classification tree on the training rows; with minsplit = 10 a
# node needs at least 10 observations before rpart will try to split it
tree.carseats2 <- rpart(
  High ~ . - Sales,
  data = Carseats2,
  subset = train,
  method = "class",
  control = rpart.control(minsplit = 10)
)
# detailed summary of splits
summary(tree.carseats2)
## Call:
## rpart(formula = High ~ . - Sales, data = Carseats2, subset = train,
## method = "class", control = rpart.control(minsplit = 10))
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.22222222 0 1.0000000 1.0000000 0.08570694
## 2 0.11111111 1 0.7777778 0.9753086 0.08535053
## 3 0.04938272 2 0.6666667 0.9135802 0.08429486
## 4 0.03703704 5 0.5061728 0.9259259 0.08452509
## 5 0.02469136 6 0.4691358 1.0000000 0.08570694
## 6 0.01851852 10 0.3703704 0.9876543 0.08553337
## 7 0.01234568 12 0.3333333 0.9876543 0.08553337
## 8 0.01000000 16 0.2839506 0.9629630 0.08515834
##
## Variable importance
## Price Age CompPrice Income Population ShelveLoc
## 26 16 15 10 9 9
## Advertising Education US Urban
## 8 4 2 1
##
## Node number 1: 200 observations, complexity param=0.2222222
## predicted class=No expected loss=0.405 P(node) =1
## class counts: 119 81
## probabilities: 0.595 0.405
## left son=2 (160 obs) right son=3 (40 obs)
## Primary splits:
## Price < 96.5 to the right, improve=10.240000, (0 missing)
## ShelveLoc splits as LRL, improve= 9.825674, (0 missing)
## Advertising < 7.5 to the left, improve= 7.885091, (0 missing)
## Age < 60.5 to the right, improve= 5.520667, (0 missing)
## Income < 61.5 to the left, improve= 5.415000, (0 missing)
## Surrogate splits:
## CompPrice < 101 to the right, agree=0.835, adj=0.175, (0 split)
## Population < 502 to the left, agree=0.805, adj=0.025, (0 split)
##
## Node number 2: 160 observations, complexity param=0.1111111
## predicted class=No expected loss=0.325 P(node) =0.8
## class counts: 108 52
## probabilities: 0.675 0.325
## left son=4 (135 obs) right son=5 (25 obs)
## Primary splits:
## ShelveLoc splits as LRL, improve=7.468148, (0 missing)
## Advertising < 6.5 to the left, improve=6.050000, (0 missing)
## Income < 61.5 to the left, improve=4.907110, (0 missing)
## Price < 137 to the right, improve=4.250179, (0 missing)
## Age < 60.5 to the right, improve=3.498223, (0 missing)
##
## Node number 3: 40 observations, complexity param=0.02469136
## predicted class=Yes expected loss=0.275 P(node) =0.2
## class counts: 11 29
## probabilities: 0.275 0.725
## left son=6 (11 obs) right son=7 (29 obs)
## Primary splits:
## Age < 64.5 to the right, improve=2.219592, (0 missing)
## Advertising < 7 to the left, improve=1.758081, (0 missing)
## Population < 414 to the left, improve=1.756452, (0 missing)
## ShelveLoc splits as LRL, improve=1.512500, (0 missing)
## US splits as LR, improve=1.106010, (0 missing)
##
## Node number 4: 135 observations, complexity param=0.04938272
## predicted class=No expected loss=0.2592593 P(node) =0.675
## class counts: 100 35
## probabilities: 0.741 0.259
## left son=8 (53 obs) right son=9 (82 obs)
## Primary splits:
## Price < 124.5 to the right, improve=4.746468, (0 missing)
## Advertising < 7.5 to the left, improve=4.561486, (0 missing)
## CompPrice < 143.5 to the left, improve=3.383383, (0 missing)
## Population < 498 to the left, improve=3.367003, (0 missing)
## Age < 49.5 to the right, improve=3.325185, (0 missing)
## Surrogate splits:
## CompPrice < 146 to the right, agree=0.674, adj=0.170, (0 split)
## Population < 478.5 to the right, agree=0.630, adj=0.057, (0 split)
## Advertising < 24 to the right, agree=0.622, adj=0.038, (0 split)
## Age < 28.5 to the left, agree=0.622, adj=0.038, (0 split)
## Education < 17.5 to the right, agree=0.622, adj=0.038, (0 split)
##
## Node number 5: 25 observations, complexity param=0.03703704
## predicted class=Yes expected loss=0.32 P(node) =0.125
## class counts: 8 17
## probabilities: 0.320 0.680
## left son=10 (3 obs) right son=11 (22 obs)
## Primary splits:
## Price < 157 to the right, improve=3.152727, (0 missing)
## Income < 43 to the left, improve=3.022857, (0 missing)
## Age < 68.5 to the right, improve=2.188824, (0 missing)
## US splits as LR, improve=2.188824, (0 missing)
## Advertising < 6 to the left, improve=1.996883, (0 missing)
##
## Node number 6: 11 observations, complexity param=0.02469136
## predicted class=No expected loss=0.4545455 P(node) =0.055
## class counts: 6 5
## probabilities: 0.545 0.455
## left son=12 (8 obs) right son=13 (3 obs)
## Primary splits:
## Population < 339 to the left, improve=2.454545, (0 missing)
## CompPrice < 117 to the right, improve=1.704545, (0 missing)
## Price < 90.5 to the right, improve=1.704545, (0 missing)
## Advertising < 8 to the left, improve=1.187879, (0 missing)
## Income < 68.5 to the right, improve=1.097403, (0 missing)
## Surrogate splits:
## Income < 91.5 to the left, agree=0.818, adj=0.333, (0 split)
##
## Node number 7: 29 observations, complexity param=0.01234568
## predicted class=Yes expected loss=0.1724138 P(node) =0.145
## class counts: 5 24
## probabilities: 0.172 0.828
## left son=14 (3 obs) right son=15 (26 obs)
## Primary splits:
## Income < 26.5 to the left, improve=1.6348360, (0 missing)
## ShelveLoc splits as LRR, improve=1.6236880, (0 missing)
## Advertising < 2 to the left, improve=1.6091950, (0 missing)
## Education < 13.5 to the right, improve=1.4008620, (0 missing)
## US splits as LR, improve=0.8624005, (0 missing)
##
## Node number 8: 53 observations, complexity param=0.01234568
## predicted class=No expected loss=0.09433962 P(node) =0.265
## class counts: 48 5
## probabilities: 0.906 0.094
## left son=16 (50 obs) right son=17 (3 obs)
## Primary splits:
## Advertising < 20 to the left, improve=2.083270, (0 missing)
## CompPrice < 148.5 to the left, improve=2.013314, (0 missing)
## Population < 393.5 to the left, improve=1.688183, (0 missing)
## Income < 30 to the right, improve=1.031604, (0 missing)
## ShelveLoc splits as L-R, improve=0.527192, (0 missing)
##
## Node number 9: 82 observations, complexity param=0.04938272
## predicted class=No expected loss=0.3658537 P(node) =0.41
## class counts: 52 30
## probabilities: 0.634 0.366
## left son=18 (48 obs) right son=19 (34 obs)
## Primary splits:
## Age < 49.5 to the right, improve=7.364957, (0 missing)
## CompPrice < 129.5 to the left, improve=5.827570, (0 missing)
## Advertising < 10.5 to the left, improve=5.601642, (0 missing)
## Income < 51.5 to the left, improve=2.627905, (0 missing)
## Population < 440.5 to the left, improve=1.859280, (0 missing)
## Surrogate splits:
## Price < 118.5 to the left, agree=0.646, adj=0.147, (0 split)
## Income < 75.5 to the left, agree=0.634, adj=0.118, (0 split)
## CompPrice < 143 to the left, agree=0.622, adj=0.088, (0 split)
## Advertising < 10.5 to the left, agree=0.622, adj=0.088, (0 split)
## Population < 496 to the left, agree=0.610, adj=0.059, (0 split)
##
## Node number 10: 3 observations
## predicted class=No expected loss=0 P(node) =0.015
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 11: 22 observations, complexity param=0.02469136
## predicted class=Yes expected loss=0.2272727 P(node) =0.11
## class counts: 5 17
## probabilities: 0.227 0.773
## left son=22 (4 obs) right son=23 (18 obs)
## Primary splits:
## Income < 35.5 to the left, improve=2.671717, (0 missing)
## US splits as LR, improve=2.432035, (0 missing)
## Advertising < 1 to the left, improve=1.870130, (0 missing)
## Education < 14.5 to the left, improve=1.573427, (0 missing)
## Age < 61.5 to the right, improve=1.436674, (0 missing)
##
## Node number 12: 8 observations
## predicted class=No expected loss=0.25 P(node) =0.04
## class counts: 6 2
## probabilities: 0.750 0.250
##
## Node number 13: 3 observations
## predicted class=Yes expected loss=0 P(node) =0.015
## class counts: 0 3
## probabilities: 0.000 1.000
##
## Node number 14: 3 observations
## predicted class=No expected loss=0.3333333 P(node) =0.015
## class counts: 2 1
## probabilities: 0.667 0.333
##
## Node number 15: 26 observations
## predicted class=Yes expected loss=0.1153846 P(node) =0.13
## class counts: 3 23
## probabilities: 0.115 0.885
##
## Node number 16: 50 observations, complexity param=0.01234568
## predicted class=No expected loss=0.06 P(node) =0.25
## class counts: 47 3
## probabilities: 0.940 0.060
## left son=32 (40 obs) right son=33 (10 obs)
## Primary splits:
## CompPrice < 148.5 to the left, improve=1.4400000, (0 missing)
## Population < 398 to the left, improve=0.8400000, (0 missing)
## Income < 29.5 to the right, improve=0.3139130, (0 missing)
## Education < 10.5 to the right, improve=0.3139130, (0 missing)
## Age < 47.5 to the left, improve=0.3066667, (0 missing)
## Surrogate splits:
## Price < 159.5 to the left, agree=0.82, adj=0.1, (0 split)
## Age < 26.5 to the right, agree=0.82, adj=0.1, (0 split)
##
## Node number 17: 3 observations
## predicted class=Yes expected loss=0.3333333 P(node) =0.015
## class counts: 1 2
## probabilities: 0.333 0.667
##
## Node number 18: 48 observations, complexity param=0.01851852
## predicted class=No expected loss=0.1875 P(node) =0.24
## class counts: 39 9
## probabilities: 0.813 0.188
## left son=36 (28 obs) right son=37 (20 obs)
## Primary splits:
## CompPrice < 124.5 to the left, improve=1.8107140, (0 missing)
## Advertising < 6.5 to the left, improve=1.0351630, (0 missing)
## Population < 442.5 to the left, improve=0.9525261, (0 missing)
## Price < 118.5 to the right, improve=0.6750000, (0 missing)
## Education < 10.5 to the right, improve=0.6750000, (0 missing)
## Surrogate splits:
## Price < 111.5 to the left, agree=0.729, adj=0.35, (0 split)
## Age < 60.5 to the right, agree=0.646, adj=0.15, (0 split)
## Population < 38 to the right, agree=0.625, adj=0.10, (0 split)
## US splits as RL, agree=0.625, adj=0.10, (0 split)
## Income < 88 to the left, agree=0.604, adj=0.05, (0 split)
##
## Node number 19: 34 observations, complexity param=0.04938272
## predicted class=Yes expected loss=0.3823529 P(node) =0.17
## class counts: 13 21
## probabilities: 0.382 0.618
## left son=38 (7 obs) right son=39 (27 obs)
## Primary splits:
## CompPrice < 115.5 to the left, improve=3.974167, (0 missing)
## Population < 125 to the left, improve=3.827712, (0 missing)
## Advertising < 10.5 to the left, improve=3.328999, (0 missing)
## US splits as LR, improve=2.285930, (0 missing)
## Income < 95.5 to the left, improve=1.801046, (0 missing)
## Surrogate splits:
## Income < 34.5 to the left, agree=0.853, adj=0.286, (0 split)
## Population < 39 to the left, agree=0.824, adj=0.143, (0 split)
##
## Node number 22: 4 observations
## predicted class=No expected loss=0.25 P(node) =0.02
## class counts: 3 1
## probabilities: 0.750 0.250
##
## Node number 23: 18 observations
## predicted class=Yes expected loss=0.1111111 P(node) =0.09
## class counts: 2 16
## probabilities: 0.111 0.889
##
## Node number 32: 40 observations
## predicted class=No expected loss=0 P(node) =0.2
## class counts: 40 0
## probabilities: 1.000 0.000
##
## Node number 33: 10 observations, complexity param=0.01234568
## predicted class=No expected loss=0.3 P(node) =0.05
## class counts: 7 3
## probabilities: 0.700 0.300
## left son=66 (6 obs) right son=67 (4 obs)
## Primary splits:
## Age < 47 to the left, improve=2.700000, (0 missing)
## Population < 390.5 to the left, improve=1.800000, (0 missing)
## CompPrice < 151.5 to the right, improve=1.200000, (0 missing)
## Price < 147 to the right, improve=1.200000, (0 missing)
## Income < 83 to the left, improve=1.152381, (0 missing)
## Surrogate splits:
## CompPrice < 151.5 to the right, agree=0.8, adj=0.50, (0 split)
## Price < 147 to the right, agree=0.8, adj=0.50, (0 split)
## Education < 15.5 to the right, agree=0.8, adj=0.50, (0 split)
## Income < 83 to the left, agree=0.7, adj=0.25, (0 split)
## Advertising < 4 to the left, agree=0.7, adj=0.25, (0 split)
##
## Node number 36: 28 observations
## predicted class=No expected loss=0.07142857 P(node) =0.14
## class counts: 26 2
## probabilities: 0.929 0.071
##
## Node number 37: 20 observations, complexity param=0.01851852
## predicted class=No expected loss=0.35 P(node) =0.1
## class counts: 13 7
## probabilities: 0.650 0.350
## left son=74 (13 obs) right son=75 (7 obs)
## Primary splits:
## Advertising < 5 to the left, improve=2.8582420, (0 missing)
## Price < 115.5 to the right, improve=2.5000000, (0 missing)
## ShelveLoc splits as L-R, improve=1.6333330, (0 missing)
## Age < 59 to the left, improve=0.9241758, (0 missing)
## Education < 14.5 to the right, improve=0.9241758, (0 missing)
## Surrogate splits:
## US splits as LR, agree=0.85, adj=0.571, (0 split)
## Population < 413 to the left, agree=0.75, adj=0.286, (0 split)
## Education < 10.5 to the right, agree=0.75, adj=0.286, (0 split)
## Urban splits as RL, agree=0.75, adj=0.286, (0 split)
## CompPrice < 131 to the right, agree=0.70, adj=0.143, (0 split)
##
## Node number 38: 7 observations
## predicted class=No expected loss=0.1428571 P(node) =0.035
## class counts: 6 1
## probabilities: 0.857 0.143
##
## Node number 39: 27 observations, complexity param=0.02469136
## predicted class=Yes expected loss=0.2592593 P(node) =0.135
## class counts: 7 20
## probabilities: 0.259 0.741
## left son=78 (6 obs) right son=79 (21 obs)
## Primary splits:
## Population < 110 to the left, improve=2.560847, (0 missing)
## Advertising < 8.5 to the left, improve=2.051689, (0 missing)
## CompPrice < 130.5 to the left, improve=1.667074, (0 missing)
## Price < 119.5 to the left, improve=1.270370, (0 missing)
## Age < 33 to the right, improve=1.037037, (0 missing)
## Surrogate splits:
## Education < 15.5 to the right, agree=0.852, adj=0.333, (0 split)
##
## Node number 66: 6 observations
## predicted class=No expected loss=0 P(node) =0.03
## class counts: 6 0
## probabilities: 1.000 0.000
##
## Node number 67: 4 observations
## predicted class=Yes expected loss=0.25 P(node) =0.02
## class counts: 1 3
## probabilities: 0.250 0.750
##
## Node number 74: 13 observations
## predicted class=No expected loss=0.1538462 P(node) =0.065
## class counts: 11 2
## probabilities: 0.846 0.154
##
## Node number 75: 7 observations
## predicted class=Yes expected loss=0.2857143 P(node) =0.035
## class counts: 2 5
## probabilities: 0.286 0.714
##
## Node number 78: 6 observations
## predicted class=No expected loss=0.3333333 P(node) =0.03
## class counts: 4 2
## probabilities: 0.667 0.333
##
## Node number 79: 21 observations
## predicted class=Yes expected loss=0.1428571 P(node) =0.105
## class counts: 3 18
## probabilities: 0.143 0.857
# draw the minsplit = 10 tree on a single-panel page
par(mfrow = c(1, 1))
plot(
  tree.carseats2,
  uniform = TRUE,
  margin = 0.2,
  main = "Classification Tree for Carseat Sales"
)
# label every node (all = TRUE) and show the observation counts (use.n = TRUE)
text(tree.carseats2, use.n = TRUE, all = TRUE, cex = 0.8)
#install.packages("rpart.plot")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.2
# draw the same tree with rpart.plot's prp()
prp(
  tree.carseats2,
  faclen = 0,
  cex = 0.7,
  extra = 1,
  space = 0.5
)
# Using cross-validation to select complexity
# display the complexity-parameter (CP) table of the fitted tree
printcp(tree.carseats2)
##
## Classification tree:
## rpart(formula = High ~ . - Sales, data = Carseats2, subset = train,
## method = "class", control = rpart.control(minsplit = 10))
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Income Population Price
## [7] ShelveLoc
##
## Root node error: 81/200 = 0.405
##
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.222222 0 1.00000 1.00000 0.085707
## 2 0.111111 1 0.77778 0.97531 0.085351
## 3 0.049383 2 0.66667 0.91358 0.084295
## 4 0.037037 5 0.50617 0.92593 0.084525
## 5 0.024691 6 0.46914 1.00000 0.085707
## 6 0.018519 10 0.37037 0.98765 0.085533
## 7 0.012346 12 0.33333 0.98765 0.085533
## 8 0.010000 16 0.28395 0.96296 0.085158
# visualize cross-validation results
plotcp(tree.carseats2)
# create additional plots, two plots on one page
par(mfrow = c(1, 2))
# NOTE: rsq.rpart() re-prints the CP table and warns "may not be applicable
# for this method" here, because tree.carseats2 was fit with method = "class"
rsq.rpart(tree.carseats2)
##
## Classification tree:
## rpart(formula = High ~ . - Sales, data = Carseats2, subset = train,
## method = "class", control = rpart.control(minsplit = 10))
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Income Population Price
## [7] ShelveLoc
##
## Root node error: 81/200 = 0.405
##
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.222222 0 1.00000 1.00000 0.085707
## 2 0.111111 1 0.77778 0.97531 0.085351
## 3 0.049383 2 0.66667 0.91358 0.084295
## 4 0.037037 5 0.50617 0.92593 0.084525
## 5 0.024691 6 0.46914 1.00000 0.085707
## 6 0.018519 10 0.37037 0.98765 0.085533
## 7 0.012346 12 0.33333 0.98765 0.085533
## 8 0.010000 16 0.28395 0.96296 0.085158
## Warning in rsq.rpart(tree.carseats2): may not be applicable for this method
# minimize the error: start from the CP table of the fitted tree
# (the xerror column is the one minimised in the next step)
tree.carseats2$cptable
## CP nsplit rel error xerror xstd
## 1 0.22222222 0 1.0000000 1.0000000 0.08570694
## 2 0.11111111 1 0.7777778 0.9753086 0.08535053
## 3 0.04938272 2 0.6666667 0.9135802 0.08429486
## 4 0.03703704 5 0.5061728 0.9259259 0.08452509
## 5 0.02469136 6 0.4691358 1.0000000 0.08570694
## 6 0.01851852 10 0.3703704 0.9876543 0.08553337
## 7 0.01234568 12 0.3333333 0.9876543 0.08553337
## 8 0.01000000 16 0.2839506 0.9629630 0.08515834
# row of the CP table with the smallest xerror
which.min(
  tree.carseats2$cptable[, "xerror"]
)
## 3
## 3
# CP value stored in that minimum-xerror row
tree.carseats2$cptable[
  which.min(tree.carseats2$cptable[, "xerror"]),
  "CP"
]
## [1] 0.04938272
# prune the tree
# Read the CP with the smallest xerror straight from the CP table instead of
# retyping a rounded value from the printed output, so the pruning stays
# correct if the table changes (e.g. different seed or training split).
cp.table <- tree.carseats2$cptable
best.row <- which.min(cp.table[, "xerror"])
prune.carseats <- prune(tree.carseats2, cp = cp.table[best.row, "CP"])
# plot the pruned tree
par(mfrow = c(1, 1))
# tree.carseats2 was fit with method = "class", so the title says
# "Classification"; the number of splits is taken from the selected row
plot(prune.carseats, uniform = TRUE, margin = 0.2,
     main = paste0("Pruned Classification Tree for Nsplit=",
                   cp.table[best.row, "nsplit"]))
text(prune.carseats, use.n = TRUE, all = TRUE, cex = .8)
# prediction and confusion matrix
# what we predict: class labels for every row, then keep the rows not in train
tree.pred2 <- predict(prune.carseats, newdata = Carseats2, type = "class")
test.pred <- tree.pred2[-train]
# what we observed for those same rows
high.test <- High[-train]
# rows = predicted class, columns = observed class
cm <- table(test.pred, high.test)
cm
## high.test
## test.pred No Yes
## No 93 25
## Yes 24 58
# correct rate: first the counts on the diagonal of the confusion matrix,
# i.e. the cases where predicted and observed class agree
diag(cm)
## No Yes
## 93 58
# share of test cases on the diagonal (correctly classified / total)
sum(diag(cm)) / sum(cm)
## [1] 0.755
# error rate: one minus the share of correctly classified test cases
1 - sum(diag(cm)) / sum(cm)
## [1] 0.245