Regression Trees and Model Trees
Understanding regression trees and model trees
Example: Calculating SDR
# set up the data
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
# compute the SDR
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
# compare the SDR for each split
sdr_a
[1] 1.202815
sdr_b
[1] 1.392751
Exercise No 3: Estimating Wine Quality
Step 2: Exploring and preparing the data
wine <- read.csv("whitewines.csv")
# examine the wine data
str(wine)
'data.frame': 4898 obs. of 12 variables:
$ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
$ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
$ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
$ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
$ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
$ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
$ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
$ density : num 0.993 0.999 0.995 0.991 0.993 ...
$ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
$ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
$ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
$ quality : int 5 6 6 4 6 6 6 6 6 7 ...
# the distribution of quality ratings
hist(wine$quality)

# summary statistics of the wine data
summary(wine)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600 Min. :0.00900 Min. : 2.00 Min. : 9.0
1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200 Median :0.04300 Median : 34.00 Median :134.0
Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391 Mean :0.04577 Mean : 35.31 Mean :138.4
3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800 Max. :0.34600 Max. :289.00 Max. :440.0
density pH sulphates alcohol quality
Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
Median :0.9937 Median :3.180 Median :0.4700 Median :10.40 Median :6.000
Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51 Mean :5.878
3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20 Max. :9.000
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
Step 3: Training a model on the data
# regression tree using rpart
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
# get more detailed information about the tree
summary(m.rpart)
#install.packages("rpart.plot")
# use the rpart.plot package to create a visualization
library(rpart.plot)
# a basic decision tree diagram
rpart.plot(m.rpart, digits = 3)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
LS0tCnRpdGxlOiAiUmVncmVzc2lvbiBUcmVlcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCgoKIyMjIyBSZWdyZXNzaW9uIFRyZWVzIGFuZCBNb2RlbCBUcmVlcwoKIyMgVW5kZXJzdGFuZGluZyByZWdyZXNzaW9uIHRyZWVzIGFuZCBtb2RlbCB0cmVlcwoKIyMgRXhhbXBsZTogQ2FsY3VsYXRpbmcgU0RSCgpgYGB7cn0KIyBzZXQgdXAgdGhlIGRhdGEKdGVlIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCwgNSwgNSwgNiwgNiwgNywgNywgNywgNykKYXQxIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCwgNSwgNSkKYXQyIDwtIGMoNiwgNiwgNywgNywgNywgNykKYnQxIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCkKYnQyIDwtIGMoNSwgNSwgNiwgNiwgNywgNywgNywgNykKYGBgCgoKCgpgYGB7cn0KIyBjb21wdXRlIHRoZSBTRFIKc2RyX2EgPC0gc2QodGVlKSAtIChsZW5ndGgoYXQxKSAvIGxlbmd0aCh0ZWUpICogc2QoYXQxKSArIGxlbmd0aChhdDIpIC8gbGVuZ3RoKHRlZSkgKiBzZChhdDIpKQpzZHJfYiA8LSBzZCh0ZWUpIC0gKGxlbmd0aChidDEpIC8gbGVuZ3RoKHRlZSkgKiBzZChidDEpICsgbGVuZ3RoKGJ0MikgLyBsZW5ndGgodGVlKSAqIHNkKGJ0MikpCmBgYAoKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBTRFIgZm9yIGVhY2ggc3BsaXQKc2RyX2EKc2RyX2IKYGBgCgoKCiMjIEV4ZXJjaXNlIE5vIDM6IEVzdGltYXRpbmcgV2luZSBRdWFsaXR5CgoKIyMgU3RlcCAyOiBFeHBsb3JpbmcgYW5kIHByZXBhcmluZyB0aGUgZGF0YQoKYGBge3J9CndpbmUgPC0gcmVhZC5jc3YoIndoaXRld2luZXMuY3N2IikKYGBgCgoKCmBgYHtyfQojIGV4YW1pbmUgdGhlIHdpbmUgZGF0YQpzdHIod2luZSkKYGBgCgoKYGBge3J9CiMgdGhlIGRpc3RyaWJ1dGlvbiBvZiBxdWFsaXR5IHJhdGluZ3MKaGlzdCh3aW5lJHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIHN1bW1hcnkgc3RhdGlzdGljcyBvZiB0aGUgd2luZSBkYXRhCnN1bW1hcnkod2luZSkKYGBgCgoKCmBgYHtyfQp3aW5lX3RyYWluIDwtIHdpbmVbMTozNzUwLCBdCndpbmVfdGVzdCA8LSB3aW5lWzM3NTE6NDg5OCwgXQpgYGAKCgoKIyMgU3RlcCAzOiBUcmFpbmluZyBhIG1vZGVsIG9uIHRoZSBkYXRhCgpgYGB7cn0KIyByZWdyZXNzaW9uIHRyZWUgdXNpbmcgcnBhcnQKbGlicmFyeShycGFydCkKbS5ycGFydCA8LSBycGFydChxdWFsaXR5IH4gLiwgZGF0YSA9IHdpbmVfdHJhaW4pCmBgYAoKCmBgYHtyfQojIGdldCBiYXNpYyBpbmZvcm1hdGlvbiBhYm91dCB0aGUgdHJlZQptLnJwYXJ0CmBgYAoKCgpgYGB7cn0KIyBnZXQgbW9yZSBkZXRhaWxlZCBpbmZvcm1hdGlvbiBhYm91dCB0aGUgdHJlZQpzdW1tYXJ5KG0ucnBhcnQpCmBgYAoKCmBgYHtyfQojaW5zdGFsbC5wYWNrYWdlcygicnBhcnQucGxvdCIpCmBgYAoKCmBgYHtyfQojIHVzZSB0aGUgcnBhcnQucGxvdCBwYWNrYWdlIHRvIGNyZWF0ZSBhIHZpc3VhbGl6YXRpb24KbGlicmFyeShycGFydC5wbG90KQpgYGAKCgpgYGB7cn0KIyBhIGJhc2ljIGRlY2lzaW9uIHRyZWUgZGlhZ3JhbQpycGFydC5wbG90KG0ucnBhcnQsIGRpZ2l0cyA9IDMpCmBgYAoKCmBgYHtyfQojIGEgZmV3IGFkanVzdG1lbnRzIHRvIHRoZSBkaWFncmFtCnJwYXJ0LnBsb3QobS5ycGFydCwgZGlnaXRzID0gNCwgZmFsbGVuLmxlYXZlcyA9IFRSVUUsIHR5cGUgPSAzLCBleHRyYSA9IDEwMSkKYGBgCgoKIyMgU3RlcCA0OiBFdmFsdWF0ZSBtb2RlbCBwZXJmb3JtYW5jCgpgYGB7cn0KIyBnZW5lcmF0ZSBwcmVkaWN0aW9ucyBmb3IgdGhlIHRlc3RpbmcgZGF0YXNldApwLnJwYXJ0IDwtIHByZWRpY3QobS5ycGFydCwgd2luZV90ZXN0KQpgYGAKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBkaXN0cmlidXRpb24gb2YgcHJlZGljdGVkIHZhbHVlcyB2cy4gYWN0dWFsIHZhbHVlcwpzdW1tYXJ5KHAucnBhcnQpCnN1bW1hcnkod2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIGNvbXBhcmUgdGhlIGNvcnJlbGF0aW9uCmNvcihwLnJwYXJ0LCB3aW5lX3Rlc3QkcXVhbGl0eSkKYGBgCgoKYGBge3J9CiMgZnVuY3Rpb24gdG8gY2FsY3VsYXRlIHRoZSBtZWFuIGFic29sdXRlIGVycm9yCk1BRSA8LSBmdW5jdGlvbihhY3R1YWwsIHByZWRpY3RlZCkgewogIG1lYW4oYWJzKGFjdHVhbCAtIHByZWRpY3RlZCkpICAKfQpgYGAKCgoKYGBge3J9CiMgbWVhbiBhYnNvbHV0ZSBlcnJvciBiZXR3ZWVuIHByZWRpY3RlZCBhbmQgYWN0dWFsIHZhbHVlcwpNQUUocC5ycGFydCwgd2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIG1lYW4gYWJzb2x1dGUgZXJyb3IgYmV0d2VlbiBhY3R1YWwgdmFsdWVzIGFuZCBtZWFuIHZhbHVlCm1lYW4od2luZV90cmFpbiRxdWFsaXR5KSAjIHJlc3VsdCA9IDUuODcKTUFFKDUuODcsIHdpbmVfdGVzdCRxdWFsaXR5KQpgYGAKCgojIyBTdGVwIDU6IEltcHJvdmluZyBtb2RlbCBwZXJmb3JtYW5jZQoKYGBge3J9CiNpbnN0YWxsLnBhY2thZ2VzKCJwbHlyIikKI2luc3RhbGwucGFja2FnZXMoIkN1YmlzdCIpCmBgYAoKCmBgYHtyfQojIHRyYWluIGEgQ3ViaXN0IE1vZGVsIFRyZWUKbGlicmFyeShDdWJpc3QpCm0uY3ViaXN0IDwtIGN1YmlzdCh4ID0gd2luZV90cmFpblstMTJdLCB5ID0gd2luZV90cmFpbiRxdWFsaXR5KQpgYGAKCgpgYGB7cn0KIyBkaXNwbGF5IGJhc2ljIGluZm9ybWF0aW9uIGFib3V0IHRoZSBtb2RlbCB0cmVlCm0uY3ViaXN0CmBgYAoKCgpgYGB7cn0KIyBkaXNwbGF5IHRoZSB0cmVlIGl0c2VsZgpzdW1tYXJ5KG0uY3ViaXN0KQpgYGAKCgpgYGB7cn0KIyBnZW5lcmF0ZSBwcmVkaWN0aW9ucyBmb3IgdGhlIG1vZGVsCnAuY3ViaXN0IDwtIHByZWRpY3QobS5jdWJpc3QsIHdpbmVfdGVzdCkKYGBgCgoKYGBge3J9CiMgc3VtbWFyeSBzdGF0aXN0aWNzIGFib3V0IHRoZSBwcmVkaWN0aW9ucwpzdW1tYXJ5KHAuY3ViaXN0KQpgYGAKCgpgYGB7cn0KIyBjb3JyZWxhdGlvbiBiZXR3ZWVuIHRoZSBwcmVkaWN0ZWQgYW5kIHRydWUgdmFsdWVzCmNvcihwLmN1YmlzdCwgd2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIG1lYW4gYWJzb2x1dGUgZXJyb3Igb2YgcHJlZGljdGVkIGFuZCB0cnVlIHZhbHVlcwojICh1c2VzIGEgY3VzdG9tIGZ1bmN0aW9uIGRlZmluZWQgYWJvdmUpCk1BRSh3aW5lX3Rlc3QkcXVhbGl0eSwgcC5jdWJpc3QpIApgYGAKCgoK