Sameer Mathur
Fitting Classification Trees
Here we use the Carseats dataset, which is built into the ISLR package, together with the tree package for fitting classification trees.
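Both packages are available on CRAN; if either is not already installed, a one-time installation suffices:
# one-time installation, if needed
install.packages(c("tree", "ISLR"))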
# loading the required packages
library(tree)
library(ISLR)
# attaching the built-in dataset
attach(Carseats)
# dimensions of the dataset
dim(Carseats)
[1] 400 11
# column names of the data set
colnames(Carseats)
[1] "Sales" "CompPrice" "Income" "Advertising" "Population"
[6] "Price" "ShelveLoc" "Age" "Education" "Urban"
[11] "US"
# structure of the dataset
str(Carseats)
'data.frame': 400 obs. of 11 variables:
$ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
$ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
$ Income : num 73 48 35 100 64 113 105 81 110 113 ...
$ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
$ Population : num 276 260 269 466 340 501 45 425 108 131 ...
$ Price : num 120 83 80 97 128 72 108 120 124 124 ...
$ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
$ Age : num 42 65 59 55 38 78 71 67 76 76 ...
$ Education : num 17 10 12 14 13 16 15 10 10 17 ...
$ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
$ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
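Per the ISLR documentation, Sales holds unit sales in thousands at each location, so the cutoff of 8 used next corresponds to 8,000 units. An optional summary shows where that cutoff sits in the distribution:
# optional: distribution of Sales, to judge the cutoff of 8
summary(Sales)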
# creating a binary variable "High": "No" if Sales <= 8, "Yes" otherwise
High <- ifelse(Sales <= 8, "No", "Yes")
# adding the new variable High to the data frame
Carseats <- data.frame(Carseats, High)
# attaching the updated data frame
attach(Carseats)
# checking the column names
colnames(Carseats)
[1] "Sales" "CompPrice" "Income" "Advertising" "Population"
[6] "Price" "ShelveLoc" "Age" "Education" "Urban"
[11] "US" "High"
head(High)
[1] "Yes" "Yes" "Yes" "No" "No" "Yes"
# fitting a classification tree on all predictors except Sales (High is derived from Sales)
tree.carseats <- tree(High ~ . - Sales, Carseats)
# summary of the model
summary(tree.carseats)
Classification tree:
tree(formula = High ~ . - Sales, data = Carseats)
Variables actually used in tree construction:
[1] "ShelveLoc" "Price" "Income" "CompPrice" "Population"
[6] "Advertising" "Age" "US"
Number of terminal nodes: 27
Residual mean deviance: 0.4575 = 170.7 / 373
Misclassification error rate: 0.09 = 36 / 400
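For a classification tree, the reported deviance is -2 times the sum, over terminal nodes m and classes k, of n_mk * log(phat_mk), where n_mk counts training observations of class k in node m. The residual mean deviance divides this by n minus the number of terminal nodes: 400 - 27 = 373, which gives 170.7 / 373 = 0.4575. Smaller values indicate a tree that fits the training data more tightly.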
# printing the full tree structure
tree.carseats
node), split, n, deviance, yval, (yprob)
* denotes terminal node
1) root 400 541.500 No ( 0.59000 0.41000 )
2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )
4) Price < 92.5 46 56.530 Yes ( 0.30435 0.69565 )
8) Income < 57 10 12.220 No ( 0.70000 0.30000 )
16) CompPrice < 110.5 5 0.000 No ( 1.00000 0.00000 ) *
17) CompPrice > 110.5 5 6.730 Yes ( 0.40000 0.60000 ) *
9) Income > 57 36 35.470 Yes ( 0.19444 0.80556 )
18) Population < 207.5 16 21.170 Yes ( 0.37500 0.62500 ) *
19) Population > 207.5 20 7.941 Yes ( 0.05000 0.95000 ) *
5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )
10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )
20) CompPrice < 124.5 96 44.890 No ( 0.93750 0.06250 )
40) Price < 106.5 38 33.150 No ( 0.84211 0.15789 )
80) Population < 177 12 16.300 No ( 0.58333 0.41667 )
160) Income < 60.5 6 0.000 No ( 1.00000 0.00000 ) *
161) Income > 60.5 6 5.407 Yes ( 0.16667 0.83333 ) *
81) Population > 177 26 8.477 No ( 0.96154 0.03846 ) *
41) Price > 106.5 58 0.000 No ( 1.00000 0.00000 ) *
21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )
42) Price < 122.5 51 70.680 Yes ( 0.49020 0.50980 )
84) ShelveLoc: Bad 11 6.702 No ( 0.90909 0.09091 ) *
85) ShelveLoc: Medium 40 52.930 Yes ( 0.37500 0.62500 )
170) Price < 109.5 16 7.481 Yes ( 0.06250 0.93750 ) *
171) Price > 109.5 24 32.600 No ( 0.58333 0.41667 )
342) Age < 49.5 13 16.050 Yes ( 0.30769 0.69231 ) *
343) Age > 49.5 11 6.702 No ( 0.90909 0.09091 ) *
43) Price > 122.5 77 55.540 No ( 0.88312 0.11688 )
86) CompPrice < 147.5 58 17.400 No ( 0.96552 0.03448 ) *
87) CompPrice > 147.5 19 25.010 No ( 0.63158 0.36842 )
174) Price < 147 12 16.300 Yes ( 0.41667 0.58333 )
348) CompPrice < 152.5 7 5.742 Yes ( 0.14286 0.85714 ) *
349) CompPrice > 152.5 5 5.004 No ( 0.80000 0.20000 ) *
175) Price > 147 7 0.000 No ( 1.00000 0.00000 ) *
11) Advertising > 13.5 45 61.830 Yes ( 0.44444 0.55556 )
22) Age < 54.5 25 25.020 Yes ( 0.20000 0.80000 )
44) CompPrice < 130.5 14 18.250 Yes ( 0.35714 0.64286 )
88) Income < 100 9 12.370 No ( 0.55556 0.44444 ) *
89) Income > 100 5 0.000 Yes ( 0.00000 1.00000 ) *
45) CompPrice > 130.5 11 0.000 Yes ( 0.00000 1.00000 ) *
23) Age > 54.5 20 22.490 No ( 0.75000 0.25000 )
46) CompPrice < 122.5 10 0.000 No ( 1.00000 0.00000 ) *
47) CompPrice > 122.5 10 13.860 No ( 0.50000 0.50000 )
94) Price < 125 5 0.000 Yes ( 0.00000 1.00000 ) *
95) Price > 125 5 0.000 No ( 1.00000 0.00000 ) *
3) ShelveLoc: Good 85 90.330 Yes ( 0.22353 0.77647 )
6) Price < 135 68 49.260 Yes ( 0.11765 0.88235 )
12) US: No 17 22.070 Yes ( 0.35294 0.64706 )
24) Price < 109 8 0.000 Yes ( 0.00000 1.00000 ) *
25) Price > 109 9 11.460 No ( 0.66667 0.33333 ) *
13) US: Yes 51 16.880 Yes ( 0.03922 0.96078 ) *
7) Price > 135 17 22.070 No ( 0.64706 0.35294 )
14) Income < 46 6 0.000 No ( 1.00000 0.00000 ) *
15) Income > 46 11 15.160 Yes ( 0.45455 0.54545 ) *
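A 27-node tree printed as text is hard to take in at a glance; plotting it is usually easier to read (pretty = 0 keeps the full factor level names on the splits):
# plotting the unpruned tree with its split labels
plot(tree.carseats)
text(tree.carseats, pretty = 0)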
# splitting the observations into training and test sets (seed set for reproducibility)
set.seed(2)
# randomly sampling 200 row indices to form the training set
train <- sample(1:nrow(Carseats), 200)
# the remaining observations form the test set
Carseats.test <- Carseats[-train,]
# extracting the test-set values of the response ("High")
High.test <- High[-train]
# verifying the subsets
head(train)
[1] 74 281 229 67 374 373
head(High.test)
[1] "Yes" "Yes" "No" "No" "No" "Yes"
# fitting the classification tree on the training set only
tree.carseats <- tree(High ~ . - Sales, Carseats, subset = train)
# printing the summary
summary(tree.carseats)
Classification tree:
tree(formula = High ~ . - Sales, data = Carseats, subset = train)
Variables actually used in tree construction:
[1] "ShelveLoc" "Price" "Income" "Age" "Advertising"
[6] "CompPrice" "Population"
Number of terminal nodes: 19
Residual mean deviance: 0.4282 = 77.51 / 181
Misclassification error rate: 0.105 = 21 / 200
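Note that 10.5% is a training error rate, measured on the same 200 observations the tree was grown on; the honest assessment comes from the held-out test set next.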
# predicting class labels on the test set
tree.pred <- predict(tree.carseats, Carseats.test, type = "class")
# confusion matrix of predictions against actual values
table(tree.pred,High.test)
High.test
tree.pred No Yes
No 86 27
Yes 30 57
# test-set accuracy: proportion of correct predictions
(86+57)/200
[1] 0.715
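The same accuracy can be computed directly, without reading counts off the table by hand:
# fraction of test observations predicted correctly
mean(tree.pred == High.test)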
# cross-validation to choose the tree size for pruning, guided by misclassification error
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
[1] "size" "dev" "k" "method"
cv.carseats
$size
[1] 19 17 14 13 9 7 3 2 1
$dev
[1] 53 53 50 50 48 51 67 66 80
$k
[1] -Inf 0.0000000 0.6666667 1.0000000 1.7500000 2.0000000
[7] 4.2500000 5.0000000 23.0000000
$method
[1] "misclass"
attr(,"class")
[1] "prune" "tree.sequence"
# plotting CV error against tree size and against the cost-complexity parameter k
par(mfrow = c(1,2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
# pruning the tree to 9 terminal nodes, the size with the lowest CV error
prune.carseats <- prune.misclass(tree.carseats, best = 9)
# plotting the pruned tree
plot(prune.carseats)
text(prune.carseats, pretty = 0)
# predicting on the test set with the pruned tree
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
# confusion matrix for the pruned tree
table(tree.pred,High.test)
High.test
tree.pred No Yes
No 94 24
Yes 22 60
# test-set accuracy of the pruned tree
(94+60)/200
[1] 0.77
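Pruning thus raised test-set accuracy from 71.5% to 77% while shrinking the tree from 19 to 9 terminal nodes, yielding a model that is both more accurate and easier to interpret.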