Regression Trees and Model Trees
Understanding regression trees and model trees
Example: Calculating SDR
# set up the data
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
# compute the SDR
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
# compare the SDR for each split
sdr_a
sdr_b
Exercise No 3: Estimating Wine Quality
Step 2: Exploring and preparing the data
wine <- read.csv("whitewines.csv")
# examine the wine data
str(wine)
# the distribution of quality ratings
hist(wine$quality)
# summary statistics of the wine data
summary(wine)
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
Step 3: Training a model on the data
# regression tree using rpart
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
# get more detailed information about the tree
summary(m.rpart)
#install.packages("rpart.plot")
# use the rpart.plot package to create a visualization
library(rpart.plot)
# a basic decision tree diagram
rpart.plot(m.rpart, digits = 3)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
LS0tCnRpdGxlOiAiUmVncmVzc2lvbiBUcmVlcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCgoKIyMjIyBSZWdyZXNzaW9uIFRyZWVzIGFuZCBNb2RlbCBUcmVlcwoKIyMgVW5kZXJzdGFuZGluZyByZWdyZXNzaW9uIHRyZWVzIGFuZCBtb2RlbCB0cmVlcwoKIyMgRXhhbXBsZTogQ2FsY3VsYXRpbmcgU0RSCgpgYGB7cn0KIyBzZXQgdXAgdGhlIGRhdGEKdGVlIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCwgNSwgNSwgNiwgNiwgNywgNywgNywgNykKYXQxIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCwgNSwgNSkKYXQyIDwtIGMoNiwgNiwgNywgNywgNywgNykKYnQxIDwtIGMoMSwgMSwgMSwgMiwgMiwgMywgNCkKYnQyIDwtIGMoNSwgNSwgNiwgNiwgNywgNywgNywgNykKYGBgCgoKCgpgYGB7cn0KIyBjb21wdXRlIHRoZSBTRFIKc2RyX2EgPC0gc2QodGVlKSAtIChsZW5ndGgoYXQxKSAvIGxlbmd0aCh0ZWUpICogc2QoYXQxKSArIGxlbmd0aChhdDIpIC8gbGVuZ3RoKHRlZSkgKiBzZChhdDIpKQpzZHJfYiA8LSBzZCh0ZWUpIC0gKGxlbmd0aChidDEpIC8gbGVuZ3RoKHRlZSkgKiBzZChidDEpICsgbGVuZ3RoKGJ0MikgLyBsZW5ndGgodGVlKSAqIHNkKGJ0MikpCmBgYAoKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBTRFIgZm9yIGVhY2ggc3BsaXQKc2RyX2EKc2RyX2IKYGBgCgoKCiMjIEV4ZXJjaXNlIE5vIDM6IEVzdGltYXRpbmcgV2luZSBRdWFsaXR5CgoKIyMgU3RlcCAyOiBFeHBsb3JpbmcgYW5kIHByZXBhcmluZyB0aGUgZGF0YQoKYGBge3J9CndpbmUgPC0gcmVhZC5jc3YoIndoaXRld2luZXMuY3N2IikKYGBgCgoKCmBgYHtyfQojIGV4YW1pbmUgdGhlIHdpbmUgZGF0YQpzdHIod2luZSkKYGBgCgoKYGBge3J9CiMgdGhlIGRpc3RyaWJ1dGlvbiBvZiBxdWFsaXR5IHJhdGluZ3MKaGlzdCh3aW5lJHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIHN1bW1hcnkgc3RhdGlzdGljcyBvZiB0aGUgd2luZSBkYXRhCnN1bW1hcnkod2luZSkKYGBgCgoKCmBgYHtyfQp3aW5lX3RyYWluIDwtIHdpbmVbMTozNzUwLCBdCndpbmVfdGVzdCA8LSB3aW5lWzM3NTE6NDg5OCwgXQpgYGAKCgoKIyMgU3RlcCAzOiBUcmFpbmluZyBhIG1vZGVsIG9uIHRoZSBkYXRhCgpgYGB7cn0KIyByZWdyZXNzaW9uIHRyZWUgdXNpbmcgcnBhcnQKbGlicmFyeShycGFydCkKbS5ycGFydCA8LSBycGFydChxdWFsaXR5IH4gLiwgZGF0YSA9IHdpbmVfdHJhaW4pCmBgYAoKCmBgYHtyfQojIGdldCBiYXNpYyBpbmZvcm1hdGlvbiBhYm91dCB0aGUgdHJlZQptLnJwYXJ0CmBgYAoKCgpgYGB7cn0KIyBnZXQgbW9yZSBkZXRhaWxlZCBpbmZvcm1hdGlvbiBhYm91dCB0aGUgdHJlZQpzdW1tYXJ5KG0ucnBhcnQpCmBgYAoKCmBgYHtyfQojaW5zdGFsbC5wYWNrYWdlcygicnBhcnQucGxvdCIpCmBgYAoKCmBgYHtyfQojIHVzZSB0aGUgcnBhcnQucGxvdCBwYWNrYWdlIHRvIGNyZWF0ZSBhIHZpc3VhbGl6YXRpb24KbGlicmFyeShycGFydC5wbG90KQpgYGAKCgpgYGB7cn0KIyBhIGJhc2ljIGRlY2lzaW9uIHRyZWUgZGlhZ3JhbQpycGFydC5wbG90KG0ucnBhcnQsIGRpZ2l0cyA9IDMpCmBgYAoKCmBgYHtyfQojIGEgZmV3IGFkanVzdG1lbnRzIHRvIHRoZSBkaWFncmFtCnJwYXJ0LnBsb3QobS5ycGFydCwgZGlnaXRzID0gNCwgZmFsbGVuLmxlYXZlcyA9IFRSVUUsIHR5cGUgPSAzLCBleHRyYSA9IDEwMSkKYGBgCgoKIyMgU3RlcCA0OiBFdmFsdWF0ZSBtb2RlbCBwZXJmb3JtYW5jCgpgYGB7cn0KIyBnZW5lcmF0ZSBwcmVkaWN0aW9ucyBmb3IgdGhlIHRlc3RpbmcgZGF0YXNldApwLnJwYXJ0IDwtIHByZWRpY3QobS5ycGFydCwgd2luZV90ZXN0KQpgYGAKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBkaXN0cmlidXRpb24gb2YgcHJlZGljdGVkIHZhbHVlcyB2cy4gYWN0dWFsIHZhbHVlcwpzdW1tYXJ5KHAucnBhcnQpCnN1bW1hcnkod2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIGNvbXBhcmUgdGhlIGNvcnJlbGF0aW9uCmNvcihwLnJwYXJ0LCB3aW5lX3Rlc3QkcXVhbGl0eSkKYGBgCgoKYGBge3J9CiMgZnVuY3Rpb24gdG8gY2FsY3VsYXRlIHRoZSBtZWFuIGFic29sdXRlIGVycm9yCk1BRSA8LSBmdW5jdGlvbihhY3R1YWwsIHByZWRpY3RlZCkgewogIG1lYW4oYWJzKGFjdHVhbCAtIHByZWRpY3RlZCkpICAKfQpgYGAKCgoKYGBge3J9CiMgbWVhbiBhYnNvbHV0ZSBlcnJvciBiZXR3ZWVuIHByZWRpY3RlZCBhbmQgYWN0dWFsIHZhbHVlcwpNQUUocC5ycGFydCwgd2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIG1lYW4gYWJzb2x1dGUgZXJyb3IgYmV0d2VlbiBhY3R1YWwgdmFsdWVzIGFuZCBtZWFuIHZhbHVlCm1lYW4od2luZV90cmFpbiRxdWFsaXR5KSAjIHJlc3VsdCA9IDUuODcKTUFFKDUuODcsIHdpbmVfdGVzdCRxdWFsaXR5KQpgYGAKCgojIyBTdGVwIDU6IEltcHJvdmluZyBtb2RlbCBwZXJmb3JtYW5jZQoKYGBge3J9CiNpbnN0YWxsLnBhY2thZ2VzKCJwbHlyIikKI2luc3RhbGwucGFja2FnZXMoIkN1YmlzdCIpCmBgYAoKCmBgYHtyfQojIHRyYWluIGEgQ3ViaXN0IE1vZGVsIFRyZWUKbGlicmFyeShDdWJpc3QpCm0uY3ViaXN0IDwtIGN1YmlzdCh4ID0gd2luZV90cmFpblstMTJdLCB5ID0gd2luZV90cmFpbiRxdWFsaXR5KQpgYGAKCgpgYGB7cn0KIyBkaXNwbGF5IGJhc2ljIGluZm9ybWF0aW9uIGFib3V0IHRoZSBtb2RlbCB0cmVlCm0uY3ViaXN0CmBgYAoKCgpgYGB7cn0KIyBkaXNwbGF5IHRoZSB0cmVlIGl0c2VsZgpzdW1tYXJ5KG0uY3ViaXN0KQpgYGAKCgpgYGB7cn0KIyBnZW5lcmF0ZSBwcmVkaWN0aW9ucyBmb3IgdGhlIG1vZGVsCnAuY3ViaXN0IDwtIHByZWRpY3QobS5jdWJpc3QsIHdpbmVfdGVzdCkKYGBgCgoKYGBge3J9CiMgc3VtbWFyeSBzdGF0aXN0aWNzIGFib3V0IHRoZSBwcmVkaWN0aW9ucwpzdW1tYXJ5KHAuY3ViaXN0KQpgYGAKCgpgYGB7cn0KIyBjb3JyZWxhdGlvbiBiZXR3ZWVuIHRoZSBwcmVkaWN0ZWQgYW5kIHRydWUgdmFsdWVzCmNvcihwLmN1YmlzdCwgd2luZV90ZXN0JHF1YWxpdHkpCmBgYAoKCmBgYHtyfQojIG1lYW4gYWJzb2x1dGUgZXJyb3Igb2YgcHJlZGljdGVkIGFuZCB0cnVlIHZhbHVlcwojICh1c2VzIGEgY3VzdG9tIGZ1bmN0aW9uIGRlZmluZWQgYWJvdmUpCk1BRSh3aW5lX3Rlc3QkcXVhbGl0eSwgcC5jdWJpc3QpIApgYGAKCgoK