Decision Trees in R
#load required libraries - rpart for classification and regression trees
library(rpart)
#mlbench for Ionosphere dataset
library(mlbench)
#load Ionosphere
data('Ionosphere')
Next we separate the data into training and test sets. We will use the former to build the model and the latter to test it. To do this, I use a simple scheme wherein I randomly select 80% of the data for the training set and assign the remainder to the test set. This is easily done in a single R statement that invokes the uniform distribution (runif) and the vectorised ifelse function. Before invoking runif, I set a random seed to ensure reproducibility of the results.
#set seed to ensure reproducible results
set.seed(39)
#split into training and test sets 80%
Ionosphere[,'train'] <- ifelse(runif(nrow(Ionosphere))<0.8,1,0)
#separate training and test sets
trainset <- Ionosphere[Ionosphere$train==1,]
testset <- Ionosphere[Ionosphere$train==0,]
#get column index of train flag
trainColNum <- grep('train',names(trainset))
#remove train flag column from train and test sets
trainset <- trainset[,-trainColNum]
testset <- testset[,-trainColNum]
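Since the split is random, the training fraction will only be approximately 80%. As a quick sanity check (this check is my addition, not part of the original workflow), one can compute the actual proportion:
#proportion of rows that ended up in the training set
nrow(trainset)/(nrow(trainset)+nrow(testset))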
head(trainColNum,3)
## [1] 36
head(Ionosphere,3)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 1 0 0.99539 -0.05889 0.85243 0.02306 0.83398 -0.37708 1.00000
## 2 1 0 1.00000 -0.18829 0.93035 -0.36156 -0.10868 -0.93597 1.00000
## 3 1 0 1.00000 -0.03365 1.00000 0.00485 1.00000 -0.12062 0.88965
## V10 V11 V12 V13 V14 V15 V16 V17
## 1 0.03760 0.85243 -0.17755 0.59755 -0.44945 0.60536 -0.38223 0.84356
## 2 -0.04549 0.50874 -0.67743 0.34432 -0.69707 -0.51685 -0.97515 0.05499
## 3 0.01198 0.73082 0.05346 0.85443 0.00827 0.54591 0.00299 0.83775
## V18 V19 V20 V21 V22 V23 V24 V25
## 1 -0.38542 0.58212 -0.32192 0.56971 -0.29674 0.36946 -0.47357 0.56811
## 2 -0.62237 0.33109 -1.00000 -0.13151 -0.45300 -0.18056 -0.35734 -0.20332
## 3 -0.13644 0.75535 -0.08540 0.70887 -0.27502 0.43385 -0.12062 0.57528
## V26 V27 V28 V29 V30 V31 V32 V33
## 1 -0.51171 0.41078 -0.46168 0.21266 -0.34090 0.42267 -0.54487 0.18641
## 2 -0.26569 -0.20468 -0.18401 -0.19040 -0.11593 -0.16626 -0.06288 -0.13738
## 3 -0.40220 0.58984 -0.22145 0.43100 -0.17365 0.60436 -0.24180 0.56045
## V34 Class train
## 1 -0.45300 good 1
## 2 -0.02447 bad 1
## 3 -0.38238 good 1
head(trainset,3)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 1 0 0.99539 -0.05889 0.85243 0.02306 0.83398 -0.37708 1.00000
## 2 1 0 1.00000 -0.18829 0.93035 -0.36156 -0.10868 -0.93597 1.00000
## 3 1 0 1.00000 -0.03365 1.00000 0.00485 1.00000 -0.12062 0.88965
## V10 V11 V12 V13 V14 V15 V16 V17
## 1 0.03760 0.85243 -0.17755 0.59755 -0.44945 0.60536 -0.38223 0.84356
## 2 -0.04549 0.50874 -0.67743 0.34432 -0.69707 -0.51685 -0.97515 0.05499
## 3 0.01198 0.73082 0.05346 0.85443 0.00827 0.54591 0.00299 0.83775
## V18 V19 V20 V21 V22 V23 V24 V25
## 1 -0.38542 0.58212 -0.32192 0.56971 -0.29674 0.36946 -0.47357 0.56811
## 2 -0.62237 0.33109 -1.00000 -0.13151 -0.45300 -0.18056 -0.35734 -0.20332
## 3 -0.13644 0.75535 -0.08540 0.70887 -0.27502 0.43385 -0.12062 0.57528
## V26 V27 V28 V29 V30 V31 V32 V33
## 1 -0.51171 0.41078 -0.46168 0.21266 -0.34090 0.42267 -0.54487 0.18641
## 2 -0.26569 -0.20468 -0.18401 -0.19040 -0.11593 -0.16626 -0.06288 -0.13738
## 3 -0.40220 0.58984 -0.22145 0.43100 -0.17365 0.60436 -0.24180 0.56045
## V34 Class
## 1 -0.45300 good
## 2 -0.02447 bad
## 3 -0.38238 good
Next I invoke rpart. Note that we need to remove the predicted variable from the dataset before passing the latter on to the algorithm, which is why we need to find the column index of the predicted variable (first line below). Also note that we set the method parameter to ‘class’ which simply tells the algorithm that the predicted variable is discrete. Finally, rpart uses Gini rule for splitting by default, and we will stick with this option.
#get column index of predicted variable in dataset
typeColNum <- grep('Class',names(Ionosphere))
#build model method='class' for classification
rpart_model <- rpart(Class~.,data = trainset, method='class')
#plot decision tree
plot(rpart_model);text(rpart_model)
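As an aside, if the base-graphics plot is hard to read, the rpart.plot package (an assumption on my part: it is not used elsewhere in this post and must be installed separately) produces a cleaner rendering:
#optional: prettier tree plot via the rpart.plot package
library(rpart.plot)
rpart.plot(rpart_model)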
The resulting plot is quite self-explanatory. Next, we see how good the model is by checking how it fares against the test data.
#...and the moment of reckoning
#for rpart objects the relevant method is predict.rpart:
#predict(object, newdata, type = c('vector','prob','class','matrix'), ...)
rpart_predict <- predict(rpart_model,testset[,-typeColNum],type='class')
mean(rpart_predict==testset$Class)
## [1] 0.9090909
#confusion matrix
table(pred=rpart_predict,true=testset$Class)
## true
## pred bad good
## bad 18 0
## good 6 42
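Beyond raw accuracy, other measures can be read off the confusion matrix. Here is a minimal sketch (treating 'good' as the positive class is my choice, not something specified above):
#store the confusion matrix and derive per-class measures
conf_mat <- table(pred=rpart_predict,true=testset$Class)
#sensitivity: proportion of true 'good' returns predicted as 'good'
conf_mat['good','good']/sum(conf_mat[,'good'])
#specificity: proportion of true 'bad' returns predicted as 'bad'
conf_mat['bad','bad']/sum(conf_mat[,'bad'])
From the table above, these work out to 42/42 = 1 and 18/24 = 0.75 respectively.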
Note that we need to verify the above results by doing multiple runs, each using different training and test sets. We will do this later.
Pruning: Next, we prune the tree using the cost-complexity criterion. Basically, the intent is to see if a shallower subtree can give us comparable results. If so, we are better off choosing the shallower tree because it reduces the likelihood of overfitting.
As described earlier, we choose the appropriate pruning parameter (aka the cost-complexity parameter, cp) by picking the value that results in the lowest prediction error. Note that all the relevant computations have already been carried out by R when we built the original tree (the call to rpart in the code above). All that remains now is to pick the right value of cp.
The printcp() function displays the cp table for a fitted rpart object:
#cost-complexity pruning
printcp(rpart_model)
##
## Classification tree:
## rpart(formula = Class ~ ., data = trainset, method = "class")
##
## Variables actually used in tree construction:
## [1] V1 V22 V27 V3 V5
##
## Root node error: 102/285 = 0.35789
##
## n= 285
##
## CP nsplit rel error xerror xstd
## 1 0.529412 0 1.00000 1.00000 0.079342
## 2 0.225490 1 0.47059 0.50980 0.063923
## 3 0.026144 2 0.24510 0.29412 0.050793
## 4 0.010000 5 0.16667 0.35294 0.054983
It is clear from the above that the lowest cross-validation error (xerror in the table) occurs for cp = 0.026 (the CP column in the table above). One can find this value programmatically like so:
# get index of CP with lowest xerror
opt <- which.min(rpart_model$cptable[,"xerror"])
#get its value
cp <- rpart_model$cptable[opt, "CP"]
cp
## [1] 0.02614379
Next, we prune the tree based on this value of CP:
The prune(tree, cp, ...) function performs cost-complexity pruning of an rpart object: it determines a nested sequence of subtrees of the supplied rpart object by recursively snipping off the least important splits, based on the complexity parameter (cp). The cp argument is the complexity parameter to which the rpart object will be trimmed.
#prune tree
pruned_model <- prune(rpart_model,cp)
#plot tree
plot(pruned_model);text(pruned_model)
Note that rpart itself uses a default cp value of 0.01 when growing the tree (set via rpart.control), which is why the last row of the cp table corresponds to cp = 0.01.
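If you want rpart to consider more candidate splits in the first place, you can lower this threshold when growing the tree. A small sketch (the value 0.005 and the name rpart_model_deep are illustrative choices of mine):
#grow a tree with a smaller complexity threshold than the default 0.01
rpart_model_deep <- rpart(Class~.,data = trainset, method='class',
                          control = rpart.control(cp = 0.005))
printcp(rpart_model_deep)
Depending on the data, this may or may not add splits beyond those in the original tree.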
Let us now compare the fully grown tree with the pruned tree.
#find proportion of correct predictions using test set
rpart_pruned_predict <- predict(pruned_model,testset[,-typeColNum],type='class')
mean(rpart_pruned_predict==testset$Class)
## [1] 0.9090909
Here the pruned tree gives the same proportion of correct predictions as the fully grown tree, but does so with a shallower tree, which is exactly what we want from pruning. However, one swallow does not a summer make: we need to check that this holds up for different training and test sets. This is easily done by creating multiple random partitions of the dataset and checking the effectiveness of pruning for each. To do this efficiently, I will create a function that takes the training fraction, the number of runs (partitions) and the dataset as inputs, and outputs the proportion of correct predictions for each run. It also optionally prunes the tree. Here is the code:
#function to do multiple runs
multiple_runs_classification <- function(train_fraction,n,dataset,prune_tree=FALSE){
  fraction_correct <- rep(NA,n)
  set.seed(39)
  for (i in 1:n){
    #assign each row to the training set with probability train_fraction
    dataset[,'train'] <- ifelse(runif(nrow(dataset))<train_fraction,1,0)
    trainColNum <- grep("train",names(dataset))
    typeColNum <- grep("Class",names(dataset))
    trainset <- dataset[dataset$train==1,-trainColNum]
    testset <- dataset[dataset$train==0,-trainColNum]
    rpart_model <- rpart(Class~.,data = trainset, method='class')
    if(prune_tree==FALSE) {
      rpart_test_predict <- predict(rpart_model,testset[,-typeColNum],type='class')
      fraction_correct[i] <- mean(rpart_test_predict==testset$Class)
    }else{
      #prune at the cp value with the lowest cross-validation error
      opt <- which.min(rpart_model$cptable[,'xerror'])
      cp <- rpart_model$cptable[opt, 'CP']
      pruned_model <- prune(rpart_model,cp)
      rpart_pruned_predict <- predict(pruned_model,testset[,-typeColNum],type='class')
      fraction_correct[i] <- mean(rpart_pruned_predict==testset$Class)
    }
  }
  return(fraction_correct)
}
I have set the default value of prune_tree to FALSE, so the function will execute the first branch of the if statement unless the default is overridden.
OK, so let's do 50 runs with and without pruning, and check the mean and standard deviation of the results for both sets of runs.
#50 runs, no pruning
unpruned_set <- multiple_runs_classification(0.8,50,Ionosphere)
mean(unpruned_set)
## [1] 0.8610148
#standard deviation
sd(unpruned_set)
## [1] 0.03616544
#50 runs, with pruning
pruned_set <- multiple_runs_classification(0.8,50,Ionosphere,prune_tree=TRUE)
mean(pruned_set)
## [1] 0.8863503
sd(pruned_set)
## [1] 0.03641121
So we see an improvement of roughly 2.5 percentage points with pruning. Also, if you were to plot the trees as we did earlier, you would see that this improvement is achieved with shallower trees. Again, I point out that this is not always the case. In fact, it often happens that pruning results in worse predictions, albeit with better reliability: a classic illustration of the bias-variance tradeoff.
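Because the seed is reset inside the function, both sets of runs use exactly the same 50 train/test partitions, so the results can be compared pairwise. Here is one way to eyeball and informally test the difference (this comparison is my addition, not part of the original analysis):
#side-by-side view of the 50-run accuracies
boxplot(list(unpruned = unpruned_set, pruned = pruned_set),
        ylab = 'proportion of correct predictions')
#paired comparison over the same 50 partitions
t.test(pruned_set, unpruned_set, paired = TRUE)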
Regression trees using rpart
In the previous section we saw how to build decision trees for situations in which the predicted variable is discrete. Let's now look at the case in which the predicted variable is continuous. We will use the BostonHousing dataset from the mlbench package. Much of the discussion of the earlier section applies here, so I will just display the code, explaining only the differences.
Information on the dataset is available at: https://www.rdocumentation.org/packages/mlbench/versions/2.1-1/topics/BostonHousing
#load Boston Housing dataset
data('BostonHousing')
#set seed to ensure reproducible results
set.seed(39)
#split into training and test sets
BostonHousing[,'train'] <- ifelse(runif(nrow(BostonHousing))<0.8,1,0)
#separate training and test sets
trainset <- BostonHousing[BostonHousing$train==1,]
testset <- BostonHousing[BostonHousing$train==0,]
#get column index of train flag
trainColNum <- grep("train",names(trainset))
#remove train flag column from train and test sets
trainset <- trainset[,-trainColNum]
testset <- testset[,-trainColNum]
A quick check of the test set:
head(testset,3)
## crim zn indus chas nox rm age dis rad tax ptratio b
## 16 0.62739 0 8.14 0 0.538 5.834 56.5 4.4986 4 307 21 395.62
## 19 0.80271 0 8.14 0 0.538 5.456 36.6 3.7965 4 307 21 288.99
## 22 0.85204 0 8.14 0 0.538 5.965 89.2 4.0123 4 307 21 392.53
## lstat medv
## 16 8.47 19.9
## 19 11.69 20.2
## 22 13.83 19.6
Next we invoke rpart, noting that the predicted variable is medv (the median value of owner-occupied homes, in $1000s) and that we need to set the method parameter to "anova". The latter tells rpart that the predicted variable is continuous (i.e., that this is a regression problem).
#build model
#method="anova" for regression tree
rpart_model <- rpart(medv~.,data = trainset, method="anova")
#plot decision tree
plot(rpart_model);text(rpart_model)
#get column index of the predicted variable
resultColNum <- grep('medv', names(trainset))
Next, we need to see how good the predictions are. Since the dependent variable is continuous, we cannot simply count correct predictions as we did in the classification case. Instead, we calculate the root mean square (RMS) error. To do this, we ask rpart to output the predictions as a vector, one prediction per record in the test dataset. The RMS error can then easily be calculated by comparing this vector with the medv column in the test dataset.
Here is the relevant code:
#the moment of reckoning
rpart_test_predict <- predict(rpart_model,testset[,-resultColNum],type = "vector" )
#calculate RMS error
rmsqe <- sqrt(mean((rpart_test_predict-testset$medv)^2))
rmsqe
## [1] 4.256207
Again, we need to do multiple runs to check the reliability of the predictions. You already know how to do that, so I will leave it aside for now and return to it below.
Moving on, we prune the tree using the cost complexity criterion as before. The code is exactly the same as in the classification problem.
printcp(rpart_model)
##
## Regression tree:
## rpart(formula = medv ~ ., data = trainset, method = "anova")
##
## Variables actually used in tree construction:
## [1] crim dis lstat rm
##
## Root node error: 34809/408 = 85.317
##
## n= 408
##
## CP nsplit rel error xerror xstd
## 1 0.446720 0 1.00000 1.00867 0.092952
## 2 0.169386 1 0.55328 0.67724 0.071299
## 3 0.065821 2 0.38389 0.47616 0.057804
## 4 0.041493 3 0.31807 0.39933 0.054937
## 5 0.024473 4 0.27658 0.36512 0.051627
## 6 0.022261 5 0.25211 0.32693 0.048690
## 7 0.016978 6 0.22985 0.31861 0.048524
## 8 0.010725 8 0.19589 0.31433 0.049010
## 9 0.010000 9 0.18516 0.29173 0.044341
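rpart also provides plotcp(), which plots the cross-validated error against cp and makes the choice easier to see at a glance:
#visualise cross-validation error against the complexity parameter
plotcp(rpart_model)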
# get index of CP with lowest xerror
opt <- which.min(rpart_model$cptable[,"xerror"])
#get its value
cp <- rpart_model$cptable[opt, "CP"]
cp
## [1] 0.01
#prune tree
pruned_model <- prune(rpart_model,cp=0.01)
#plot tree
plot(pruned_model);text(pruned_model)
As a cross-check, let us also fit a regression tree to the Boston data from the MASS package, this time using the tree package, and choose the tree size by cross-validation with cv.tree.
library(tree)
library(MASS)
library(ggplot2)
set.seed(39)
train = sample(1:nrow(Boston),nrow(Boston)/2) # 50/50 split between test and train
tree.boston = tree (medv ~ ., Boston, subset=train)
cv.boston = cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type='b')
The cross-validated deviance is almost identical for tree sizes of 7, 8 and 9. The lowest value occurs at a size of 7 or 8, so pruning to 7 terminal nodes would make little difference.
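The best size can also be picked off programmatically rather than read from the plot:
#tree size with the lowest cross-validated deviance
best_size <- cv.boston$size[which.min(cv.boston$dev)]
best_size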
Returning to the rpart model: since the optimal cp equals the default value of 0.01, the pruned tree is unchanged and I won't show it here. This means that, as far as cost-complexity pruning is concerned, the optimal subtree is the same as the original tree. To confirm this, we need to do multiple runs as before, the exercise I promised to return to. Basically, we need a function analogous to the one above that computes the root mean square error instead of the proportion of correct predictions.
Decision trees work by splitting a dataset recursively. That is, subsets arising from a split are further split until a predetermined termination criterion is reached. At each step, a split is made based on the independent variable that results in the largest possible reduction in heterogeneity of the dependent variable.
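To make the "reduction in heterogeneity" idea concrete, here is a minimal sketch of the Gini impurity calculation that underlies rpart's default classification splits, using the Ionosphere data from earlier (the split threshold below is arbitrary and purely illustrative; rpart searches over all candidate variables and thresholds):
#Gini impurity of a vector of class labels
gini <- function(labels){
  p <- prop.table(table(labels))
  1 - sum(p^2)
}
#impurity reduction for an illustrative split of the Ionosphere data on V3 < 0.5
left  <- Ionosphere$Class[Ionosphere$V3 <  0.5]
right <- Ionosphere$Class[Ionosphere$V3 >= 0.5]
n <- nrow(Ionosphere)
gini(Ionosphere$Class) - ((length(left)/n)*gini(left) + (length(right)/n)*gini(right))
At each node, rpart chooses the split that maximises this reduction over all candidate splits.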
The main drawback of decision trees is that they are prone to overfitting. The reason for this is that trees, if grown deep, are able to fit all kinds of variations in the data, including noise. Although it is possible to address this partially by pruning, the result often remains less than satisfactory. This is because the algorithm makes a locally optimal choice at each split without any regard to whether the choice made is the best one overall. A poor split made in the initial stages can thus doom the model, a problem that cannot be fixed by post-hoc pruning.
OK, here is the multiple-run regression function:
#function to do multiple runs
multiple_runs_regression <- function(train_fraction,n,dataset,prune_tree=FALSE){
  rms_error <- rep(NA,n)
  set.seed(39)
  for (i in 1:n){
    #assign each row to the training set with probability train_fraction
    dataset[,'train'] <- ifelse(runif(nrow(dataset))<train_fraction,1,0)
    trainColNum <- grep("train",names(dataset))
    trainset <- dataset[dataset$train==1,-trainColNum]
    testset <- dataset[dataset$train==0,-trainColNum]
    #column index of the predicted variable
    resultColNum <- grep('medv', names(trainset))
    rpart_model <- rpart(medv~.,data = trainset, method='anova')
    if(prune_tree==FALSE) {
      rpart_test_predict <- predict(rpart_model,testset[,-resultColNum],type='vector')
      rms_error[i] <- sqrt(mean((rpart_test_predict-testset$medv)^2))
    }else{
      #prune at the cp value with the lowest cross-validation error
      opt <- which.min(rpart_model$cptable[,'xerror'])
      cp <- rpart_model$cptable[opt, 'CP']
      pruned_model <- prune(rpart_model,cp)
      rpart_pruned_predict <- predict(pruned_model,testset[,-resultColNum],type='vector')
      rms_error[i] <- sqrt(mean((rpart_pruned_predict-testset$medv)^2))
    }
  }
  return(rms_error)
}
#50 runs, no pruning
unpruned_set <- multiple_runs_regression(0.8,50,BostonHousing)
unpruned_set
## [1] 4.256207 4.023745 4.937624 4.442867 3.949191 4.580405 4.231781
## [8] 5.226838 4.551647 5.106434 4.331929 4.606016 5.144415 4.164032
## [15] 6.085005 5.368826 3.202384 5.823941 4.172432 5.455203 4.207505
## [22] 4.040981 4.524882 3.721925 4.484515 5.315103 4.073561 4.487564
## [29] 4.068936 4.316248 4.946322 6.555206 4.501731 4.908300 5.027597
## [36] 5.112158 3.790568 5.775800 3.698543 4.288646 4.324738 4.628913
## [43] 4.415374 3.693728 5.291071 3.774922 4.654453 5.401695 5.092927
## [50] 3.715561
mean(unpruned_set)
## [1] 4.610008
#standard deviation
sd(unpruned_set)
## [1] 0.6871532
#50 runs, with pruning
pruned_set <- multiple_runs_regression(0.8,50,BostonHousing,prune_tree=TRUE)
mean(pruned_set)
sd(pruned_set)
Comparing the mean and standard deviation of the pruned runs with those of the unpruned runs tells us whether pruning helps for the regression problem; I'll leave interpreting the numbers to you.
Returning to the tree-package model of the Boston data: the cross-validation results suggest that the most complex tree is the best one. We can nevertheless try pruning this tree to keep 5 terminal nodes.
prune.boston = prune.tree(tree.boston,best=5) # We keep 5 terminal nodes
plot(prune.boston)
text(prune.boston, pretty=0)
Finally, we evaluate the test-set performance of the best tree according to cross-validation, namely the full tree with 8 terminal nodes.
yhat = predict(tree.boston, newdata=Boston[-train,])
boston.test = Boston[-train,"medv"]
print(mean((yhat-boston.test)^2))
## [1] 27.2498
plot(yhat, boston.test)
abline(0,1)
The test-set MSE associated with this tree is about 27.25.
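For a rough comparison with the RMS errors reported for the rpart models earlier, we can take the square root of this MSE, which comes to roughly 5.2:
#express the test-set MSE as an RMS error
sqrt(mean((yhat-boston.test)^2))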