cat("\014")
require(tree)
## Loading required package: tree
require(ISLR)
## Loading required package: ISLR
attach(Carseats)
hist(Sales) # look at the distribution of Sales to pick a cutoff for a binary High/Low classification
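As an optional visual aid (not part of the original code), the cutoff used below to define High can be marked on the histogram:
abline(v = 8, col = "red", lty = 2) # Sales = 8 (thousand units) is the cutoff used for High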
High = factor(ifelse(Sales <= 8, "NO", "YES")) # tree() needs a factor response for a classification tree
Carseats = data.frame(Carseats, High) # add High as a column of the data frame
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 1 9.50 138 73 11 276 120 Bad 42
## 2 11.22 111 48 16 260 83 Good 65
## 3 10.06 113 35 10 269 80 Medium 59
## 4 7.40 117 100 4 466 97 Medium 55
## 5 4.15 141 64 3 340 128 Bad 38
## 6 10.81 124 113 13 501 72 Bad 78
## Education Urban US High
## 1 17 Yes Yes YES
## 2 10 Yes Yes YES
## 3 12 Yes Yes YES
## 4 14 Yes Yes NO
## 5 13 Yes No NO
## 6 16 No Yes YES
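As a quick sanity check (an aside, not in the original output), the class balance of the new variable can be tabulated; the root node of the tree printed below shows roughly 59% NO and 41% YES.
table(High) # counts of NO vs YES across the 400 stores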
Exclude Sales from the predictors: High was derived from it, so including it would leak the response into the tree.
tree.Carseats = tree(High~.-Sales, data=Carseats) # fit the classification tree
summary(tree.Carseats)
##
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "CompPrice" "Population"
## [6] "Advertising" "Age" "US"
## Number of terminal nodes: 27
## Residual mean deviance: 0.4575 = 170.7 / 373
## Misclassification error rate: 0.09 = 36 / 400
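The training error reported above (36/400 = 0.09) can be reproduced by predicting on the data the tree was grown on; a minimal check, with train.pred just an illustrative name:
train.pred = predict(tree.Carseats, Carseats, type = "class")
mean(train.pred != Carseats$High) # should agree with the 0.09 misclassification rate above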
plot(tree.Carseats)
text(tree.Carseats,pretty=0,cex=0.6)
Printing the tree object gives the full detail of every node: the split rule, the number of observations, the deviance, the predicted class, and the class proportions. Terminal nodes are marked with an asterisk.
tree.Carseats
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 400 541.500 NO ( 0.59000 0.41000 )
## 2) ShelveLoc: Bad,Medium 315 390.600 NO ( 0.68889 0.31111 )
## 4) Price < 92.5 46 56.530 YES ( 0.30435 0.69565 )
## 8) Income < 57 10 12.220 NO ( 0.70000 0.30000 )
## 16) CompPrice < 110.5 5 0.000 NO ( 1.00000 0.00000 ) *
## 17) CompPrice > 110.5 5 6.730 YES ( 0.40000 0.60000 ) *
## 9) Income > 57 36 35.470 YES ( 0.19444 0.80556 )
## 18) Population < 207.5 16 21.170 YES ( 0.37500 0.62500 ) *
## 19) Population > 207.5 20 7.941 YES ( 0.05000 0.95000 ) *
## 5) Price > 92.5 269 299.800 NO ( 0.75465 0.24535 )
## 10) Advertising < 13.5 224 213.200 NO ( 0.81696 0.18304 )
## 20) CompPrice < 124.5 96 44.890 NO ( 0.93750 0.06250 )
## 40) Price < 106.5 38 33.150 NO ( 0.84211 0.15789 )
## 80) Population < 177 12 16.300 NO ( 0.58333 0.41667 )
## 160) Income < 60.5 6 0.000 NO ( 1.00000 0.00000 ) *
## 161) Income > 60.5 6 5.407 YES ( 0.16667 0.83333 ) *
## 81) Population > 177 26 8.477 NO ( 0.96154 0.03846 ) *
## 41) Price > 106.5 58 0.000 NO ( 1.00000 0.00000 ) *
## 21) CompPrice > 124.5 128 150.200 NO ( 0.72656 0.27344 )
## 42) Price < 122.5 51 70.680 YES ( 0.49020 0.50980 )
## 84) ShelveLoc: Bad 11 6.702 NO ( 0.90909 0.09091 ) *
## 85) ShelveLoc: Medium 40 52.930 YES ( 0.37500 0.62500 )
## 170) Price < 109.5 16 7.481 YES ( 0.06250 0.93750 ) *
## 171) Price > 109.5 24 32.600 NO ( 0.58333 0.41667 )
## 342) Age < 49.5 13 16.050 YES ( 0.30769 0.69231 ) *
## 343) Age > 49.5 11 6.702 NO ( 0.90909 0.09091 ) *
## 43) Price > 122.5 77 55.540 NO ( 0.88312 0.11688 )
## 86) CompPrice < 147.5 58 17.400 NO ( 0.96552 0.03448 ) *
## 87) CompPrice > 147.5 19 25.010 NO ( 0.63158 0.36842 )
## 174) Price < 147 12 16.300 YES ( 0.41667 0.58333 )
## 348) CompPrice < 152.5 7 5.742 YES ( 0.14286 0.85714 ) *
## 349) CompPrice > 152.5 5 5.004 NO ( 0.80000 0.20000 ) *
## 175) Price > 147 7 0.000 NO ( 1.00000 0.00000 ) *
## 11) Advertising > 13.5 45 61.830 YES ( 0.44444 0.55556 )
## 22) Age < 54.5 25 25.020 YES ( 0.20000 0.80000 )
## 44) CompPrice < 130.5 14 18.250 YES ( 0.35714 0.64286 )
## 88) Income < 100 9 12.370 NO ( 0.55556 0.44444 ) *
## 89) Income > 100 5 0.000 YES ( 0.00000 1.00000 ) *
## 45) CompPrice > 130.5 11 0.000 YES ( 0.00000 1.00000 ) *
## 23) Age > 54.5 20 22.490 NO ( 0.75000 0.25000 )
## 46) CompPrice < 122.5 10 0.000 NO ( 1.00000 0.00000 ) *
## 47) CompPrice > 122.5 10 13.860 NO ( 0.50000 0.50000 )
## 94) Price < 125 5 0.000 YES ( 0.00000 1.00000 ) *
## 95) Price > 125 5 0.000 NO ( 1.00000 0.00000 ) *
## 3) ShelveLoc: Good 85 90.330 YES ( 0.22353 0.77647 )
## 6) Price < 135 68 49.260 YES ( 0.11765 0.88235 )
## 12) US: No 17 22.070 YES ( 0.35294 0.64706 )
## 24) Price < 109 8 0.000 YES ( 0.00000 1.00000 ) *
## 25) Price > 109 9 11.460 NO ( 0.66667 0.33333 ) *
## 13) US: Yes 51 16.880 YES ( 0.03922 0.96078 ) *
## 7) Price > 135 17 22.070 NO ( 0.64706 0.35294 )
## 14) Income < 46 6 0.000 NO ( 1.00000 0.00000 ) *
## 15) Income > 46 11 15.160 YES ( 0.45455 0.54545 ) *
We now split the 400 observations into a training set of 250 and a test set of 150, refit the tree on the training set, and evaluate it on the held-out test set.
set.seed(10)
train=sample(1:nrow(Carseats),250)
tree.Carseats=tree(High~.-Sales,Carseats,subset=train) # refit the tree using only the training observations
plot(tree.Carseats)
text(tree.Carseats,pretty=0,cex=0.6)
tree.pred=predict(tree.Carseats,Carseats[-train,],type="class") #predict using the test data
with(Carseats[-train,],table(tree.pred,High))
## High
## tree.pred NO YES
## NO 67 17
## YES 19 47
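The unpruned tree gets (67 + 47) / 150 = 0.76 of the test observations right. A minimal sketch to compute the accuracy from the confusion matrix (conf is just an illustrative name):
conf = with(Carseats[-train,], table(tree.pred, High))
sum(diag(conf)) / sum(conf) # test accuracy of the unpruned tree, 0.76 here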
Prune the tree using cross-validation. Passing FUN=prune.misclass tells cv.tree() to use the misclassification rate, rather than the deviance, to guide the cost-complexity pruning.
cv.Carseats = cv.tree(tree.Carseats,FUN=prune.misclass)
cv.Carseats
## $size
## [1] 23 19 15 12 7 6 4 3 2 1
##
## $dev
## [1] 68 68 67 63 64 64 64 65 70 100
##
## $k
## [1] -Inf 0.0 0.5 1.0 2.0 4.0 4.5 5.0 12.0 31.0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.Carseats)
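The CV deviance above is minimized (63 misclassifications) at 12 terminal nodes; the call below prunes to 13, which is essentially the same point on the curve. As an aside, the minimizing size can be extracted programmatically (best.size is just an illustrative name):
best.size = cv.Carseats$size[which.min(cv.Carseats$dev)]
best.size # 12 for the CV run above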
prune.Carseats = prune.misclass(tree.Carseats,best=13) # prune the training tree back to about 13 terminal nodes
plot(prune.Carseats)
text(prune.Carseats,pretty=0,cex=0.6)
Predict on the test data with the pruned tree and compare against the earlier confusion matrix.
tree.pred=predict(prune.Carseats,Carseats[-train,],type="class")
with(Carseats[-train,],table(tree.pred,High))
## High
## tree.pred NO YES
## NO 68 21
## YES 18 43
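The pruned tree is right on (68 + 43) / 150 = 0.74 of the test set, slightly below the full tree's 0.76 on this particular split, but with far fewer terminal nodes and a much more interpretable fit. The same accuracy computation applies (conf.pruned is just an illustrative name; tree.pred here holds the pruned-tree predictions):
conf.pruned = with(Carseats[-train,], table(tree.pred, High))
sum(diag(conf.pruned)) / sum(conf.pruned) # test accuracy of the pruned tree, 0.74 here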