ISLR: Chapter 8 Decision Trees

Sameer Mathur

Fitting Classification Trees

About the Dataset

Here we are using Carseats Dataset Which is the Inbuilt dataset in ISLR Package. We are also using Tree Package

Loading and Viewing the Dataset

# loading the package in library
library(tree)
library(ISLR)
# attaching the inbuilt dataset
attach(Carseats)
# dimentions of the dataset
dim(Carseats)
[1] 400  11
# column names of the data set
colnames(Carseats)
 [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
 [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
[11] "US"         
# structure of the dataset
str(Carseats)
'data.frame':   400 obs. of  11 variables:
 $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
 $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
 $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
 $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
 $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
 $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
 $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
 $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
 $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
 $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
 $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

Computing New variable

# creating a variable "HIGH" with following condistions
High <- ifelse(Sales <= 8,"No","Yes")
# creating a dataframe with new variable High
Carseats <- data.frame(Carseats,High)
# attaching the dataframe
attach(Carseats)
# checking the column names
colnames(Carseats)
 [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
 [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
[11] "US"          "High"       
head(High)
[1] "Yes" "Yes" "Yes" "No"  "No"  "Yes"

Fitting Classification Trees

Fitting Classification Trees

# fitting decision tree
tree.carseats <- tree(High ~. -Sales, Carseats)
# summary of the model
summary(tree.carseats)

Classification tree:
tree(formula = High ~ . - Sales, data = Carseats)
Variables actually used in tree construction:
[1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
[6] "Advertising" "Age"         "US"         
Number of terminal nodes:  27 
Residual mean deviance:  0.4575 = 170.7 / 373 
Misclassification error rate: 0.09 = 36 / 400 

plot of chunk unnamed-chunk-5

node), split, n, deviance, yval, (yprob)
      * denotes terminal node

  1) root 400 541.500 No ( 0.59000 0.41000 )  
    2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )  
      4) Price < 92.5 46  56.530 Yes ( 0.30435 0.69565 )  
        8) Income < 57 10  12.220 No ( 0.70000 0.30000 )  
         16) CompPrice < 110.5 5   0.000 No ( 1.00000 0.00000 ) *
         17) CompPrice > 110.5 5   6.730 Yes ( 0.40000 0.60000 ) *
        9) Income > 57 36  35.470 Yes ( 0.19444 0.80556 )  
         18) Population < 207.5 16  21.170 Yes ( 0.37500 0.62500 ) *
         19) Population > 207.5 20   7.941 Yes ( 0.05000 0.95000 ) *
      5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )  
       10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )  
         20) CompPrice < 124.5 96  44.890 No ( 0.93750 0.06250 )  
           40) Price < 106.5 38  33.150 No ( 0.84211 0.15789 )  
             80) Population < 177 12  16.300 No ( 0.58333 0.41667 )  
              160) Income < 60.5 6   0.000 No ( 1.00000 0.00000 ) *
              161) Income > 60.5 6   5.407 Yes ( 0.16667 0.83333 ) *
             81) Population > 177 26   8.477 No ( 0.96154 0.03846 ) *
           41) Price > 106.5 58   0.000 No ( 1.00000 0.00000 ) *
         21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )  
           42) Price < 122.5 51  70.680 Yes ( 0.49020 0.50980 )  
             84) ShelveLoc: Bad 11   6.702 No ( 0.90909 0.09091 ) *
             85) ShelveLoc: Medium 40  52.930 Yes ( 0.37500 0.62500 )  
              170) Price < 109.5 16   7.481 Yes ( 0.06250 0.93750 ) *
              171) Price > 109.5 24  32.600 No ( 0.58333 0.41667 )  
                342) Age < 49.5 13  16.050 Yes ( 0.30769 0.69231 ) *
                343) Age > 49.5 11   6.702 No ( 0.90909 0.09091 ) *
           43) Price > 122.5 77  55.540 No ( 0.88312 0.11688 )  
             86) CompPrice < 147.5 58  17.400 No ( 0.96552 0.03448 ) *
             87) CompPrice > 147.5 19  25.010 No ( 0.63158 0.36842 )  
              174) Price < 147 12  16.300 Yes ( 0.41667 0.58333 )  
                348) CompPrice < 152.5 7   5.742 Yes ( 0.14286 0.85714 ) *
                349) CompPrice > 152.5 5   5.004 No ( 0.80000 0.20000 ) *
              175) Price > 147 7   0.000 No ( 1.00000 0.00000 ) *
       11) Advertising > 13.5 45  61.830 Yes ( 0.44444 0.55556 )  
         22) Age < 54.5 25  25.020 Yes ( 0.20000 0.80000 )  
           44) CompPrice < 130.5 14  18.250 Yes ( 0.35714 0.64286 )  
             88) Income < 100 9  12.370 No ( 0.55556 0.44444 ) *
             89) Income > 100 5   0.000 Yes ( 0.00000 1.00000 ) *
           45) CompPrice > 130.5 11   0.000 Yes ( 0.00000 1.00000 ) *
         23) Age > 54.5 20  22.490 No ( 0.75000 0.25000 )  
           46) CompPrice < 122.5 10   0.000 No ( 1.00000 0.00000 ) *
           47) CompPrice > 122.5 10  13.860 No ( 0.50000 0.50000 )  
             94) Price < 125 5   0.000 Yes ( 0.00000 1.00000 ) *
             95) Price > 125 5   0.000 No ( 1.00000 0.00000 ) *
    3) ShelveLoc: Good 85  90.330 Yes ( 0.22353 0.77647 )  
      6) Price < 135 68  49.260 Yes ( 0.11765 0.88235 )  
       12) US: No 17  22.070 Yes ( 0.35294 0.64706 )  
         24) Price < 109 8   0.000 Yes ( 0.00000 1.00000 ) *
         25) Price > 109 9  11.460 No ( 0.66667 0.33333 ) *
       13) US: Yes 51  16.880 Yes ( 0.03922 0.96078 ) *
      7) Price > 135 17  22.070 No ( 0.64706 0.35294 )  
       14) Income < 46 6   0.000 No ( 1.00000 0.00000 ) *
       15) Income > 46 11  15.160 Yes ( 0.45455 0.54545 ) *

Creating Training and Test set

# fixing the observations in training and test sets
set.seed(2)
# taking the sample of 200 data points from Carseats dataset to create traing set 
train <- sample(1:nrow(Carseats), 200)
# remaing data points are in test set
Carseats.test <- Carseats[-train,]
# taking test set of dependent variable ("High")
High.test <- High[-train]
# varifying the subsets
head(train)
[1]  74 281 229  67 374 373
head(High.test)
[1] "Yes" "Yes" "No"  "No"  "No"  "Yes"

Fitting the model for decision tree

# fitting the model for decision tree
tree.carseats <- tree(High~.-Sales,Carseats,subset = train)
# printing the summary
summary(tree.carseats)

Classification tree:
tree(formula = High ~ . - Sales, data = Carseats, subset = train)
Variables actually used in tree construction:
[1] "ShelveLoc"   "Price"       "Income"      "Age"         "Advertising"
[6] "CompPrice"   "Population" 
Number of terminal nodes:  19 
Residual mean deviance:  0.4282 = 77.51 / 181 
Misclassification error rate: 0.105 = 21 / 200 
# predicting the model
tree.pred <- predict(tree.carseats,Carseats.test,type="class")
# table for correct prediction
table(tree.pred,High.test)
         High.test
tree.pred No Yes
      No  86  27
      Yes 30  57
# probability for correct prediction
(86+57)/200
[1] 0.715
# cross validation to check where to stop pruning
cv.carseats <- cv.tree(tree.carseats,FUN=prune.misclass)
names(cv.carseats)
[1] "size"   "dev"    "k"      "method"
cv.carseats
$size
[1] 19 17 14 13  9  7  3  2  1

$dev
[1] 53 53 50 50 48 51 67 66 80

$k
[1]       -Inf  0.0000000  0.6666667  1.0000000  1.7500000  2.0000000
[7]  4.2500000  5.0000000 23.0000000

$method
[1] "misclass"

attr(,"class")
[1] "prune"         "tree.sequence"
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")

plot of chunk unnamed-chunk-10

plot(cv.carseats$k,cv.carseats$dev,type="b")

plot of chunk unnamed-chunk-11

# prune the tree
prune.carseats <- prune.misclass(tree.carseats,best=9)
plot(prune.carseats)
text(prune.carseats,pretty=0)

plot of chunk unnamed-chunk-12

# predicting the model
tree.pred <- predict(prune.carseats,Carseats.test,type="class")
# table for correct prediction
table(tree.pred,High.test)
         High.test
tree.pred No Yes
      No  94  24
      Yes 22  60
# probability for correct prediction
(94+60)/200
[1] 0.77