Decision Tree:

The basic idea is that if you have a bunch of variables that you want to use to predict an outcome, you can take each of those variables and use it to split the observations into different groups.

As you split the observations into groups, you can evaluate the homogeneity of the outcome within each group.

You then continue splitting if necessary, until the groups are either homogeneous enough or small enough that you need to stop.
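To make "homogeneity" concrete, a common measure is Gini impurity: one minus the sum of the squared class proportions in a group, which is zero when a group contains only a single class. A candidate split is then scored by the weighted impurity of the two groups it creates. Here is a minimal sketch (my own illustration, not part of the lecture code; the helper gini() is made up) that scores by hand the petal-length split the fitted tree below ends up choosing:

# Gini impurity of a vector of class labels: 1 - sum(p_k^2)
gini = function(y) {
  p = table(y) / length(y)
  1 - sum(p^2)
}

# impurity of the full iris data: three balanced classes, so 2/3
gini(iris$Species)

# weighted impurity after a candidate split at Petal.Length < 2.6
left  = iris$Species[iris$Petal.Length < 2.6]
right = iris$Species[iris$Petal.Length >= 2.6]
(length(left) / nrow(iris)) * gini(left) +
  (length(right) / nrow(iris)) * gini(right)   # drops to 1/3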

The pros of this approach are that the resulting trees are easy to interpret, and they tend to perform better in non-linear settings than the linear regression models we talked about in the previous lectures.

The cons are that:

1. Without pruning or some kind of cross-validation, trees can easily overfit (see the pruning sketch after this list).
2. Uncertainty in the predictions is harder to estimate than it is in the linear regression setting.
3. The results can be unstable: they may change with the exact values of the tuning parameters or the variables that you collected.
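To see what pruning looks like in practice, here is a minimal sketch using rpart directly rather than the caret workflow below (my own illustration; the object names full.tree, best.cp, and pruned.tree are made up): grow a deliberately large tree, inspect rpart's cross-validated complexity table, and prune back to the cp value with the lowest cross-validated error.

library(rpart)

# grow a deliberately large tree on the raw iris data
full.tree = rpart(Species ~ ., data = iris, control = rpart.control(cp = 0))

# cross-validated error (xerror) for each candidate cp value
printcp(full.tree)

# prune back to the cp with the smallest cross-validated error
best.cp = full.tree$cptable[which.min(full.tree$cptable[, "xerror"]), "CP"]
pruned.tree = prune(full.tree, cp = best.cp)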

data("iris")

names(iris) = tolower(names(iris))

table(iris$species)
## 
##     setosa versicolor  virginica 
##         50         50         50
suppressMessages(library(caret))

# split the data 70/30 into training and test sets (stratified by species)
index = createDataPartition(y=iris$species, p=0.7, list=FALSE)

train.set = iris[index,]
test.set = iris[-index,]

dim(train.set)
## [1] 105   5
dim(test.set)
## [1] 45  5
# petal width vs. sepal width, coloured by species (qplot comes from ggplot2, which caret attaches)
with(iris, qplot(petal.width, sepal.width, colour=species, size=I(2)))

# fit a classification tree (CART), using 10-fold cross-validation to choose the complexity parameter cp
iris.tree = train(species ~ ., 
                  data=train.set, 
                  method="rpart", 
                  trControl = trainControl(method = "cv"))
## Loading required package: rpart
iris.tree
## CART 
## 
## 105 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 94, 96, 96, 94, 94, 94, ... 
## Resampling results across tuning parameters:
## 
##   cp         Accuracy   Kappa    
##   0.0000000  0.9505051  0.9254167
##   0.4428571  0.7383838  0.6132738
##   0.5000000  0.2969697  0.0000000
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.
summary(iris.tree$finalModel)
## Call:
## rpart(formula = .outcome ~ ., data = list(sepal.length = c(5.1, 
## 4.9, 4.6, 4.6, 5, 4.4, 4.9, 5.4, 4.3, 5.8, 5.1, 5.4, 5.1, 4.6, 
## 5.1, 4.8, 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5, 5.5, 4.9, 
## 4.4, 5.1, 5, 4.4, 5, 5.1, 4.8, 5.1, 7, 6.9, 5.5, 6.5, 5.7, 6.3, 
## 4.9, 6.6, 5.2, 5, 6, 6.1, 6.7, 5.6, 5.8, 6.2, 5.9, 6.3, 6.1, 
## 6.4, 6.8, 6.7, 6, 5.5, 5.5, 5.8, 6, 6, 6.7, 6.3, 5.6, 5.5, 5, 
## 5.6, 5.7, 5.8, 7.1, 6.5, 4.9, 7.3, 6.7, 6.5, 6.4, 6.8, 5.7, 5.8, 
## 6.4, 6.5, 7.7, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 7.4, 7.9, 6.3, 6.1, 
## 6.3, 6.4, 6.9, 5.8, 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9), sepal.width = c(3.5, 
## 3, 3.1, 3.4, 3.4, 2.9, 3.1, 3.7, 3, 4, 3.5, 3.4, 3.7, 3.6, 3.3, 
## 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.6, 3, 
## 3.4, 3.5, 3.2, 3.5, 3.8, 3, 3.8, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 
## 2.4, 2.9, 2.7, 2, 2.2, 2.9, 3.1, 3, 2.7, 2.2, 3.2, 2.5, 2.8, 
## 2.9, 2.8, 3, 2.9, 2.4, 2.4, 2.7, 2.7, 3.4, 3.1, 2.3, 3, 2.6, 
## 2.3, 2.7, 3, 2.7, 3, 3, 2.5, 2.9, 2.5, 3.2, 2.7, 3, 2.5, 2.8, 
## 3.2, 3, 2.6, 2.8, 2.7, 3.3, 3.2, 2.8, 3, 2.8, 3.8, 2.8, 2.6, 
## 3.4, 3.1, 3.1, 2.7, 3.2, 3.3, 3, 2.5, 3, 3.4, 3), petal.length = c(1.4, 
## 1.4, 1.5, 1.4, 1.5, 1.4, 1.5, 1.5, 1.1, 1.2, 1.4, 1.7, 1.5, 1, 
## 1.7, 1.9, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5, 1.4, 1.5, 1.2, 1.3, 1.4, 
## 1.3, 1.5, 1.3, 1.3, 1.6, 1.9, 1.4, 1.6, 4.7, 4.9, 4, 4.6, 4.5, 
## 4.7, 3.3, 4.6, 3.9, 3.5, 4, 4.7, 4.4, 4.5, 4.1, 4.5, 4.8, 4.9, 
## 4.7, 4.3, 4.8, 5, 4.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.7, 4.4, 4.1, 
## 4.4, 3.3, 4.2, 4.2, 5.1, 5.9, 5.8, 4.5, 6.3, 5.8, 5.1, 5.3, 5.5, 
## 5, 5.1, 5.3, 5.5, 6.9, 6.7, 4.9, 5.7, 6, 4.8, 4.9, 6.1, 6.4, 
## 5.1, 5.6, 5.6, 5.5, 5.4, 5.1, 5.9, 5.7, 5.2, 5, 5.2, 5.4, 5.1
## ), petal.width = c(0.2, 0.2, 0.2, 0.3, 0.2, 0.2, 0.1, 0.2, 0.1, 
## 0.2, 0.3, 0.2, 0.4, 0.2, 0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.1, 
## 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2, 0.3, 0.2, 0.6, 0.4, 0.3, 0.2, 
## 1.4, 1.5, 1.3, 1.5, 1.3, 1.6, 1, 1.3, 1.4, 1, 1, 1.4, 1.4, 1.5, 
## 1, 1.5, 1.8, 1.5, 1.2, 1.3, 1.4, 1.7, 1.5, 1.1, 1, 1.2, 1.6, 
## 1.6, 1.5, 1.3, 1.3, 1.2, 1, 1.3, 1.2, 1.9, 2.1, 2.2, 1.7, 1.8, 
## 1.8, 2, 1.9, 2.1, 2, 2.4, 2.3, 1.8, 2.3, 2, 1.8, 2.1, 1.8, 1.8, 
## 1.8, 1.9, 2, 1.5, 1.4, 2.4, 1.8, 2.1, 1.9, 2.3, 2.5, 2.3, 1.9, 
## 2, 2.3, 1.8), .outcome = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
## 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
## 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
## 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 
## 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
## 3, 3, 3, 3, 3, 3, 3, 3, 3)), control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 105 
## 
##          CP nsplit  rel error
## 1 0.5000000      0 1.00000000
## 2 0.4428571      1 0.50000000
## 3 0.0000000      2 0.05714286
## 
## Variable importance
##  petal.width petal.length sepal.length  sepal.width 
##           34           31           20           14 
## 
## Node number 1: 105 observations,    complexity param=0.5
##   predicted class=setosa      expected loss=0.6666667  P(node) =1
##     class counts:    35    35    35
##    probabilities: 0.333 0.333 0.333 
##   left son=2 (35 obs) right son=3 (70 obs)
##   Primary splits:
##       petal.length < 2.6  to the left,  improve=35.00000, (0 missing)
##       petal.width  < 0.8  to the left,  improve=35.00000, (0 missing)
##       sepal.length < 5.45 to the left,  improve=24.39984, (0 missing)
##       sepal.width  < 3.35 to the right, improve=13.73418, (0 missing)
##   Surrogate splits:
##       petal.width  < 0.8  to the left,  agree=1.000, adj=1.000, (0 split)
##       sepal.length < 5.45 to the left,  agree=0.924, adj=0.771, (0 split)
##       sepal.width  < 3.35 to the right, agree=0.838, adj=0.514, (0 split)
## 
## Node number 2: 35 observations
##   predicted class=setosa      expected loss=0  P(node) =0.3333333
##     class counts:    35     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 70 observations,    complexity param=0.4428571
##   predicted class=versicolor  expected loss=0.5  P(node) =0.6666667
##     class counts:     0    35    35
##    probabilities: 0.000 0.500 0.500 
##   left son=6 (37 obs) right son=7 (33 obs)
##   Primary splits:
##       petal.width  < 1.75 to the left,  improve=27.547090, (0 missing)
##       petal.length < 4.85 to the left,  improve=24.107290, (0 missing)
##       sepal.length < 6.05 to the left,  improve= 5.996503, (0 missing)
##       sepal.width  < 2.45 to the left,  improve= 5.163934, (0 missing)
##   Surrogate splits:
##       petal.length < 4.75 to the left,  agree=0.900, adj=0.788, (0 split)
##       sepal.length < 6.15 to the left,  agree=0.700, adj=0.364, (0 split)
##       sepal.width  < 2.95 to the left,  agree=0.671, adj=0.303, (0 split)
## 
## Node number 6: 37 observations
##   predicted class=versicolor  expected loss=0.08108108  P(node) =0.352381
##     class counts:     0    34     3
##    probabilities: 0.000 0.919 0.081 
## 
## Node number 7: 33 observations
##   predicted class=virginica   expected loss=0.03030303  P(node) =0.3142857
##     class counts:     0     1    32
##    probabilities: 0.000 0.030 0.970
# plot the final tree, labelling nodes with their class counts
plot(iris.tree$finalModel, uniform=TRUE,
     main="Classification Tree")
text(iris.tree$finalModel, use.n=TRUE, all=TRUE, cex=.8)

suppressMessages(library(rattle))

# a prettier rendering of the same tree
fancyRpartPlot(iris.tree$finalModel)

# predict on the held-out test set and compare to the true species
iris.pred = predict(iris.tree, newdata = test.set)
table(iris.pred, test.set$species)
##             
## iris.pred    setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         15         2
##   virginica       0          0        13
# misclassification rate on the test set
error.rate = round(mean(iris.pred != test.set$species), 2)
error.rate
## [1] 0.04
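For more detail than the raw error rate, caret's confusionMatrix() reports the same table along with overall accuracy (and a confidence interval) plus per-class sensitivity and specificity for these predictions:

confusionMatrix(iris.pred, test.set$species)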