Regression Trees and Model Trees

Understanding regression trees and model trees

Example: Calculating SDR

# set up the data
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
# compute the SDR
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
# compare the SDR for each split
sdr_a
[1] 1.202815
sdr_b
[1] 1.392751

Exercise No 3: Estimating Wine Quality

Step 2: Exploring and preparing the data

wine <- read.csv("whitewines.csv")
# examine the wine data
str(wine)
'data.frame':   4898 obs. of  12 variables:
 $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
 $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
 $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
 $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
 $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
 $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
 $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
 $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
 $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
 $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
 $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
 $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
# the distribution of quality ratings
hist(wine$quality)

# summary statistics of the wine data
summary(wine)
 fixed.acidity    volatile.acidity  citric.acid     residual.sugar     chlorides       free.sulfur.dioxide total.sulfur.dioxide
 Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600   Min.   :0.00900   Min.   :  2.00      Min.   :  9.0       
 1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700   1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0       
 Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200   Median :0.04300   Median : 34.00      Median :134.0       
 Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391   Mean   :0.04577   Mean   : 35.31      Mean   :138.4       
 3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900   3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0       
 Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800   Max.   :0.34600   Max.   :289.00      Max.   :440.0       
    density             pH          sulphates         alcohol         quality     
 Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
 1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
 Median :0.9937   Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
 Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
 3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
 Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000  
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]

Step 3: Training a model on the data

# regression tree using rpart
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
# get more detailed information about the tree
summary(m.rpart)
#install.packages("rpart.plot")
# use the rpart.plot package to create a visualization
library(rpart.plot)
# a basic decision tree diagram
rpart.plot(m.rpart, digits = 3)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)

Step 4: Evaluate model performanc

# generate predictions for the testing dataset
p.rpart <- predict(m.rpart, wine_test)
# compare the distribution of predicted values vs. actual values
summary(p.rpart)
summary(wine_test$quality)
# compare the correlation
cor(p.rpart, wine_test$quality)
# function to calculate the mean absolute error
MAE <- function(actual, predicted) {
  mean(abs(actual - predicted))  
}
# mean absolute error between predicted and actual values
MAE(p.rpart, wine_test$quality)
# mean absolute error between actual values and mean value
mean(wine_train$quality) # result = 5.87
MAE(5.87, wine_test$quality)

Step 5: Improving model performance

#install.packages("plyr")
#install.packages("Cubist")
# train a Cubist Model Tree
library(Cubist)
m.cubist <- cubist(x = wine_train[-12], y = wine_train$quality)
# display basic information about the model tree
m.cubist
# display the tree itself
summary(m.cubist)
# generate predictions for the model
p.cubist <- predict(m.cubist, wine_test)
# summary statistics about the predictions
summary(p.cubist)
# correlation between the predicted and true values
cor(p.cubist, wine_test$quality)
# mean absolute error of predicted and true values
# (uses a custom function defined above)
MAE(wine_test$quality, p.cubist) 
LS0tCnRpdGxlOiAiUmVncmVzc2lvbiBUcmVlcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCgoKIyMjIyBSZWdyZXNzaW9uIFRyZWVzIGFuZCBNb2RlbCBUcmVlcwoKIyMgVW5kZXJzdGFuZGluZyByZWdyZXNzaW9uIHRyZWVzIGFuZCBtb2RlbCB0cmVlcwoKIyMgRXhhbXBsZTogQ2FsY3VsYXRpbmcgU0RSCgpgYGB7cn0KIyBzZXQgdXAgdGhlIGRhdGEKdGVlIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCwgNSwgNSwgNiwgNiwgNywgNywgNywgNykKYXQxIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCwgNSwgNSkKYXQyIDwtIGMoNiwgNiwgNywgNywgNywgNykKYnQxIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCkKYnQyIDwtIGMoNSwgNSwgNiwgNiwgNywgNywgNywgNykKYGBgCgoKCgpgYGB7cn0KIyBjb21wdXRlIHRoZSBTRFIKc2RyX2EgPC0gc2QodGVlKSAtIChsZW5ndGgoYXQxKSAvIGxlbmd0aCh0ZWUpICogc2QoYXQxKSArIGxlbmd0aChhdDIpIC8gbGVuZ3RoKHRlZSkgKiBzZChhdDIpKQpzZHJfYiA8LSBzZCh0ZWUpIC0gKGxlbmd0aChidDEpIC8gbGVuZ3RoKHRlZSkgKiBzZChidDEpICsgbGVuZ3RoKGJ0MikgLyBsZW5ndGgodGVlKSAqIHNkKGJ0MikpCmBgYAoKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBTRFIgZm9yIGVhY2ggc3BsaXQKc2RyX2EKc2RyX2IKYGBgCgoKCiMjIEV4ZXJjaXNlIE5vIDM6IEVzdGltYXRpbmcgV2luZSBRdWFsaXR5CgoKIyMgU3RlcCAyOiBFeHBsb3JpbmcgYW5kIHByZXBhcmluZyB0aGUgZGF0YQoKYGBge3J9CndpbmUgPC0gcmVhZC5jc3YoIndoaXRld2luZXMuY3N2IikKYGBgCgoKCmBgYHtyfQojIGV4YW1pbmUgdGhlIHdpbmUgZGF0YQpzdHIod2luZSkKYGBgCgoKYGBge3J9CiMgdGhlIGRpc3RyaWJ1dGlvbiBvZiBxdWFsaXR5IHJhdGluZ3MKaGlzdCh3aW5lJHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIHN1bW1hcnkgc3RhdGlzdGljcyBvZiB0aGUgd2luZSBkYXRhCnN1bW1hcnkod2luZSkKYGBgCgoKCmBgYHtyfQp3aW5lX3RyYWluIDwtIHdpbmVbMTozNzUwLCBdCndpbmVfdGVzdCA8LSB3aW5lWzM3NTE6NDg5OCwgXQpgYGAKCgoKIyMgU3RlcCAzOiBUcmFpbmluZyBhIG1vZGVsIG9uIHRoZSBkYXRhCgpgYGB7cn0KIyByZWdyZXNzaW9uIHRyZWUgdXNpbmcgcnBhcnQKbGlicmFyeShycGFydCkKbS5ycGFydCA8LSBycGFydChxdWFsaXR5IH4gLiwgZGF0YSA9IHdpbmVfdHJhaW4pCmBgYAoKCmBgYHtyfQojIGdldCBiYXNpYyBpbmZvcm1hdGlvbiBhYm91dCB0aGUgdHJlZQptLnJwYXJ0CmBgYAoKCgpgYGB7cn0KIyBnZXQgbW9yZSBkZXRhaWxlZCBpbmZvcm1hdGlvbiBhYm91dCB0aGUgdHJlZQpzdW1tYXJ5KG0ucnBhcnQpCmBgYAoKCmBgYHtyfQojaW5zdGFsbC5wYWNrYWdlcygicnBhcnQucGxvdCIpCmBgYAoKCmBgYHtyfQojIHVzZSB0aGUgcnBhcnQucGxvdCBwYWNrYWdlIHRvIGNyZWF0ZSBhIHZpc3VhbGl6YXRpb24KbGlicmFyeShycGFydC5wbG90KQpgYGAKCgpgYGB7cn0KIyBhIGJhc2ljIGRlY2lzaW9uIHRyZWUgZGlhZ3JhbQpycGFydC5wbG90KG0ucnBhcnQsIGRpZ2l0cyA9IDMpCmBgYAoKCmBgYHtyfQojIGEgZmV3IGFkanVzdG1lbnRzIHRvIHRoZSBkaWFncmFtCnJwYXJ0LnBsb3QobS5ycGFydCwgZGlnaXRzID0gNCwgZmFsbGVuLmxlYXZlcyA9IFRSVUUsIHR5cGUgPSAzLCBleHRyYSA9IDEwMSkKYGBgCgoKIyMgU3RlcCA0OiBFdmFsdWF0ZSBtb2RlbCBwZXJmb3JtYW5jCgpgYGB7cn0KIyBnZW5lcmF0ZSBwcmVkaWN0aW9ucyBmb3IgdGhlIHRlc3RpbmcgZGF0YXNldApwLnJwYXJ0IDwtIHByZWRpY3QobS5ycGFydCwgd2luZV90ZXN0KQpgYGAKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBkaXN0cmlidXRpb24gb2YgcHJlZGljdGVkIHZhbHVlcyB2cy4gYWN0dWFsIHZhbHVlcwpzdW1tYXJ5KHAucnBhcnQpCnN1bW1hcnkod2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIGNvbXBhcmUgdGhlIGNvcnJlbGF0aW9uCmNvcihwLnJwYXJ0LCB3aW5lX3Rlc3QkcXVhbGl0eSkKYGBgCgoKYGBge3J9CiMgZnVuY3Rpb24gdG8gY2FsY3VsYXRlIHRoZSBtZWFuIGFic29sdXRlIGVycm9yCk1BRSA8LSBmdW5jdGlvbihhY3R1YWwsIHByZWRpY3RlZCkgewogIG1lYW4oYWJzKGFjdHVhbCAtIHByZWRpY3RlZCkpICAKfQpgYGAKCgoKYGBge3J9CiMgbWVhbiBhYnNvbHV0ZSBlcnJvciBiZXR3ZWVuIHByZWRpY3RlZCBhbmQgYWN0dWFsIHZhbHVlcwpNQUUocC5ycGFydCwgd2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIG1lYW4gYWJzb2x1dGUgZXJyb3IgYmV0d2VlbiBhY3R1YWwgdmFsdWVzIGFuZCBtZWFuIHZhbHVlCm1lYW4od2luZV90cmFpbiRxdWFsaXR5KSAjIHJlc3VsdCA9IDUuODcKTUFFKDUuODcsIHdpbmVfdGVzdCRxdWFsaXR5KQpgYGAKCgojIyBTdGVwIDU6IEltcHJvdmluZyBtb2RlbCBwZXJmb3JtYW5jZQoKYGBge3J9CiNpbnN0YWxsLnBhY2thZ2VzKCJwbHlyIikKI2luc3RhbGwucGFja2FnZXMoIkN1YmlzdCIpCmBgYAoKCmBgYHtyfQojIHRyYWluIGEgQ3ViaXN0IE1vZGVsIFRyZWUKbGlicmFyeShDdWJpc3QpCm0uY3ViaXN0IDwtIGN1YmlzdCh4ID0gd2luZV90cmFpblstMTJdLCB5ID0gd2luZV90cmFpbiRxdWFsaXR5KQpgYGAKCgpgYGB7cn0KIyBkaXNwbGF5IGJhc2ljIGluZm9ybWF0aW9uIGFib3V0IHRoZSBtb2RlbCB0cmVlCm0uY3ViaXN0CmBgYAoKCgpgYGB7cn0KIyBkaXNwbGF5IHRoZSB0cmVlIGl0c2VsZgpzdW1tYXJ5KG0uY3ViaXN0KQpgYGAKCgpgYGB7cn0KIyBnZW5lcmF0ZSBwcmVkaWN0aW9ucyBmb3IgdGhlIG1vZGVsCnAuY3ViaXN0IDwtIHByZWRpY3QobS5jdWJpc3QsIHdpbmVfdGVzdCkKYGBgCgoKYGBge3J9CiMgc3VtbWFyeSBzdGF0aXN0aWNzIGFib3V0IHRoZSBwcmVkaWN0aW9ucwpzdW1tYXJ5KHAuY3ViaXN0KQpgYGAKCgpgYGB7cn0KIyBjb3JyZWxhdGlvbiBiZXR3ZWVuIHRoZSBwcmVkaWN0ZWQgYW5kIHRydWUgdmFsdWVzCmNvcihwLmN1YmlzdCwgd2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIG1lYW4gYWJzb2x1dGUgZXJyb3Igb2YgcHJlZGljdGVkIGFuZCB0cnVlIHZhbHVlcwojICh1c2VzIGEgY3VzdG9tIGZ1bmN0aW9uIGRlZmluZWQgYWJvdmUpCk1BRSh3aW5lX3Rlc3QkcXVhbGl0eSwgcC5jdWJpc3QpIApgYGAKCgoK