Regression Trees and Model Trees
Understanding regression trees and model trees
Example: Calculating SDR
# set up the data: `tee` holds every target value before splitting; each
# candidate split (A or B) partitions those same values into two branches
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)

# Standard deviation reduction (SDR) of a candidate split.
#
# parent:   numeric vector of target values before the split
# branches: list of numeric vectors, one per branch produced by the split
# returns:  sd(parent) minus the sum of the branch standard deviations, each
#           weighted by that branch's share of the parent's observations
sdr <- function(parent, branches) {
  weighted_sd <- vapply(
    branches,
    function(branch) length(branch) / length(parent) * sd(branch),
    numeric(1)
  )
  sd(parent) - sum(weighted_sd)
}

# compute the SDR
sdr_a <- sdr(tee, list(at1, at2))
sdr_b <- sdr(tee, list(bt1, bt2))
# compare the SDR for each split (the larger reduction is the better split)
sdr_a
## [1] 1.202815
sdr_b
## [1] 1.392751
Exercise No 3: Estimating Wine Quality
Step 2: Exploring and preparing the data
# whitewines.csv is semicolon-delimited, so the separator has to be given
# explicitly; with read.csv()'s default comma separator every row is read
# into a single character column instead of 12 numeric ones
wine <- read.csv("whitewines.csv", sep = ";")
# examine the wine data
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
# the distribution of quality ratings
# (`quality` is the integer column that the models below are trained to predict)
hist(wine$quality)

# summary statistics of the wine data
# (min, quartiles, mean, and max for each of the 12 columns)
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917
## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937
## Mean :0.04577 Mean : 35.31 Mean :138.4 Mean :0.9940
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961
## Max. :0.34600 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.180 Median :0.4700 Median :10.40 Median :6.000
## Mean :3.188 Mean :0.4898 Mean :10.51 Mean :5.878
## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
## Max. :3.820 Max. :1.0800 Max. :14.20 Max. :9.000
# positional hold-out split: rows 1-3750 train the models and the remaining
# rows 3751-4898 (1148 wines) are kept back to evaluate them
# NOTE(review): a positional split assumes the rows are not stored in any
# meaningful order - confirm against the data source
wine_train <- wine[seq_len(3750), ]
wine_test <- wine[seq(3751, 4898), ]
Step 3: Training a model on the data
# fit a regression tree with rpart, predicting quality from all other columns
library(rpart)
m.rpart <- rpart(formula = quality ~ ., data = wine_train)
# print the fitted tree: one line per node giving its split, number of
# observations, deviance, and predicted value (terminal nodes are starred)
print(m.rpart)
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 3140.06000 5.886933
## 2) alcohol< 10.85 2473 1510.66200 5.609381
## 4) volatile.acidity>=0.2425 1406 740.15080 5.402560
## 8) volatile.acidity>=0.4225 182 92.99451 4.994505 *
## 9) volatile.acidity< 0.4225 1224 612.34560 5.463235 *
## 5) volatile.acidity< 0.2425 1067 631.12090 5.881912 *
## 3) alcohol>=10.85 1277 1069.95800 6.424432
## 6) free.sulfur.dioxide< 11.5 93 99.18280 5.473118 *
## 7) free.sulfur.dioxide>=11.5 1184 879.99920 6.499155
## 14) alcohol< 11.85 611 447.38130 6.296236 *
## 15) alcohol>=11.85 573 380.63180 6.715532 *
# get more detailed information about the tree: the complexity-parameter
# table, variable importance, and the primary and surrogate splits that were
# considered at each node
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.17816211 0 1.0000000 1.0005343 0.02388706
## 2 0.04439109 1 0.8218379 0.8226024 0.02238696
## 3 0.02890893 2 0.7774468 0.7894902 0.02226523
## 4 0.01655575 3 0.7485379 0.7611952 0.02111412
## 5 0.01108600 4 0.7319821 0.7493066 0.02068952
## 6 0.01000000 5 0.7208961 0.7379990 0.02044947
##
## Variable importance
## alcohol density chlorides
## 38 23 12
## volatile.acidity total.sulfur.dioxide free.sulfur.dioxide
## 12 7 6
## sulphates pH residual.sugar
## 1 1 1
##
## Node number 1: 3750 observations, complexity param=0.1781621
## mean=5.886933, MSE=0.8373493
## left son=2 (2473 obs) right son=3 (1277 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.17816210, (0 missing)
## density < 0.992385 to the right, improve=0.11980970, (0 missing)
## chlorides < 0.0395 to the right, improve=0.08199995, (0 missing)
## total.sulfur.dioxide < 153.5 to the right, improve=0.03875440, (0 missing)
## free.sulfur.dioxide < 11.75 to the left, improve=0.03632119, (0 missing)
## Surrogate splits:
## density < 0.99201 to the right, agree=0.869, adj=0.614, (0 split)
## chlorides < 0.0375 to the right, agree=0.773, adj=0.334, (0 split)
## total.sulfur.dioxide < 102.5 to the right, agree=0.705, adj=0.132, (0 split)
## sulphates < 0.345 to the right, agree=0.670, adj=0.031, (0 split)
## fixed.acidity < 5.25 to the right, agree=0.662, adj=0.009, (0 split)
##
## Node number 2: 2473 observations, complexity param=0.04439109
## mean=5.609381, MSE=0.6108623
## left son=4 (1406 obs) right son=5 (1067 obs)
## Primary splits:
## volatile.acidity < 0.2425 to the right, improve=0.09227123, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.04177240, (0 missing)
## alcohol < 10.15 to the left, improve=0.03313802, (0 missing)
## citric.acid < 0.205 to the left, improve=0.02721200, (0 missing)
## pH < 3.325 to the left, improve=0.01860335, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 111.5 to the right, agree=0.610, adj=0.097, (0 split)
## pH < 3.295 to the left, agree=0.598, adj=0.067, (0 split)
## alcohol < 10.05 to the left, agree=0.590, adj=0.049, (0 split)
## sulphates < 0.715 to the left, agree=0.584, adj=0.037, (0 split)
## residual.sugar < 1.85 to the right, agree=0.581, adj=0.029, (0 split)
##
## Node number 3: 1277 observations, complexity param=0.02890893
## mean=6.424432, MSE=0.8378682
## left son=6 (93 obs) right son=7 (1184 obs)
## Primary splits:
## free.sulfur.dioxide < 11.5 to the left, improve=0.08484051, (0 missing)
## alcohol < 11.85 to the left, improve=0.06149941, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.04259695, (0 missing)
## residual.sugar < 1.275 to the left, improve=0.02795662, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.02541719, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 48.5 to the left, agree=0.937, adj=0.14, (0 split)
##
## Node number 4: 1406 observations, complexity param=0.011086
## mean=5.40256, MSE=0.526423
## left son=8 (182 obs) right son=9 (1224 obs)
## Primary splits:
## volatile.acidity < 0.4225 to the right, improve=0.04703189, (0 missing)
## free.sulfur.dioxide < 17.5 to the left, improve=0.04607770, (0 missing)
## total.sulfur.dioxide < 86.5 to the left, improve=0.02894310, (0 missing)
## alcohol < 10.25 to the left, improve=0.02890077, (0 missing)
## chlorides < 0.0455 to the right, improve=0.02096635, (0 missing)
## Surrogate splits:
## density < 0.99107 to the left, agree=0.874, adj=0.027, (0 split)
## citric.acid < 0.11 to the left, agree=0.873, adj=0.022, (0 split)
## fixed.acidity < 9.85 to the right, agree=0.873, adj=0.016, (0 split)
## chlorides < 0.206 to the right, agree=0.871, adj=0.005, (0 split)
##
## Node number 5: 1067 observations
## mean=5.881912, MSE=0.591491
##
## Node number 6: 93 observations
## mean=5.473118, MSE=1.066482
##
## Node number 7: 1184 observations, complexity param=0.01655575
## mean=6.499155, MSE=0.7432425
## left son=14 (611 obs) right son=15 (573 obs)
## Primary splits:
## alcohol < 11.85 to the left, improve=0.05907511, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.04400660, (0 missing)
## density < 0.991395 to the right, improve=0.02522410, (0 missing)
## residual.sugar < 1.225 to the left, improve=0.02503936, (0 missing)
## pH < 3.245 to the left, improve=0.02417936, (0 missing)
## Surrogate splits:
## density < 0.991115 to the right, agree=0.710, adj=0.401, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.665, adj=0.307, (0 split)
## chlorides < 0.0365 to the right, agree=0.631, adj=0.237, (0 split)
## total.sulfur.dioxide < 126.5 to the right, agree=0.566, adj=0.103, (0 split)
## residual.sugar < 1.525 to the left, agree=0.560, adj=0.091, (0 split)
##
## Node number 8: 182 observations
## mean=4.994505, MSE=0.5109588
##
## Node number 9: 1224 observations
## mean=5.463235, MSE=0.5002823
##
## Node number 14: 611 observations
## mean=6.296236, MSE=0.7322117
##
## Node number 15: 573 observations
## mean=6.715532, MSE=0.6642788
# one-time setup, uncomment if the package is not installed yet:
#install.packages("rpart.plot")
# rpart.plot draws diagrams of fitted rpart trees
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
# basic diagram of the regression tree
rpart.plot(x = m.rpart, digits = 3)

# same tree with the display options adjusted
# (see ?rpart.plot for the meaning of the type and extra codes)
rpart.plot(
  x = m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)

Step 5: Improving model performance
# one-time setup, uncomment if the packages are not installed yet:
#install.packages("plyr")
#install.packages("Cubist")
# train a Cubist Model Tree
library(Cubist)
## Loading required package: lattice
# x: the 11 predictor columns (column 12, `quality`, is dropped by position)
# y: the quality rating the model tree learns to predict
m.cubist <- cubist(x = wine_train[-12], y = wine_train$quality)
# display basic information about the model tree
m.cubist
##
## Call:
## cubist.default(x = wine_train[-12], y = wine_train$quality)
##
## Number of samples: 3750
## Number of predictors: 11
##
## Number of committees: 1
## Number of rules: 10
# display the tree itself: every rule with its conditions and linear model,
# followed by the error figures on the training data and the attribute usage
summary(m.cubist)
##
## Call:
## cubist.default(x = wine_train[-12], y = wine_train$quality)
##
##
## Cubist [Release 2.07 GPL Edition] Mon Feb 23 20:47:58 2026
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 3750 cases (12 attributes) from undefined.data
##
## Model:
##
## Rule 1: [918 cases, mean 5.3, range 3 to 7, est err 0.5]
##
## if
## volatile.acidity > 0.26
## alcohol <= 10.2
## then
## outcome = 66.6 + 0.187 alcohol + 0.041 residual.sugar - 65 density
## - 1.38 volatile.acidity + 0.5 pH + 0.0028 free.sulfur.dioxide
##
## Rule 2: [177 cases, mean 5.5, range 4 to 8, est err 0.5]
##
## if
## citric.acid > 0.42
## residual.sugar <= 14.05
## free.sulfur.dioxide > 49
## then
## outcome = 32.5 + 0.379 alcohol - 0.024 residual.sugar - 31 density
## - 0.54 volatile.acidity + 0.15 sulphates
## + 0.0003 total.sulfur.dioxide + 0.07 pH + 0.4 chlorides
## + 0.01 fixed.acidity
##
## Rule 3: [490 cases, mean 5.7, range 3 to 8, est err 0.5]
##
## if
## volatile.acidity <= 0.26
## residual.sugar <= 12.75
## free.sulfur.dioxide <= 49
## alcohol <= 10.2
## then
## outcome = 253.6 - 252 density + 0.102 residual.sugar
## - 2.63 volatile.acidity + 0.0149 free.sulfur.dioxide
## + 1.27 sulphates + 0.52 pH + 0.012 alcohol
##
## Rule 4: [71 cases, mean 5.8, range 5 to 7, est err 0.4]
##
## if
## fixed.acidity <= 7.5
## volatile.acidity <= 0.26
## residual.sugar > 14.05
## alcohol > 9.1
## then
## outcome = 127.2 - 125 density + 0.055 residual.sugar
## - 2.47 volatile.acidity + 0.24 fixed.acidity + 0.67 sulphates
## + 0.0017 total.sulfur.dioxide + 1.8 chlorides + 0.23 pH
## - 0.0015 free.sulfur.dioxide + 0.013 alcohol
##
## Rule 5: [446 cases, mean 5.8, range 3 to 9, est err 0.5]
##
## if
## citric.acid <= 0.42
## residual.sugar <= 14.05
## free.sulfur.dioxide > 49
## then
## outcome = 29.6 + 0.372 alcohol + 2.81 citric.acid
## - 2.94 volatile.acidity - 28 density + 0.013 residual.sugar
## + 0.13 sulphates + 0.0003 total.sulfur.dioxide
## + 0.01 fixed.acidity
##
## Rule 6: [451 cases, mean 5.9, range 3 to 8, est err 0.7]
##
## if
## free.sulfur.dioxide <= 20
## alcohol > 10.2
## then
## outcome = 16.2 + 0.0537 free.sulfur.dioxide + 0.311 alcohol
## - 2.63 volatile.acidity + 0.037 residual.sugar
## - 0.2 fixed.acidity - 13 density + 0.08 pH
##
## Rule 7: [113 cases, mean 5.9, range 5 to 7, est err 0.5]
##
## if
## fixed.acidity <= 7.5
## volatile.acidity <= 0.26
## residual.sugar > 14.05
## alcohol <= 9.1
## then
## outcome = -8.3 + 2.204 alcohol - 0.143 residual.sugar
## + 0.0066 total.sulfur.dioxide - 1.65 sulphates
## - 0.0092 free.sulfur.dioxide - 3 density
##
## Rule 8: [35 cases, mean 6.2, range 3 to 8, est err 0.8]
##
## if
## fixed.acidity > 7.5
## volatile.acidity <= 0.26
## residual.sugar > 14.05
## alcohol <= 10.2
## then
## outcome = 29.5 - 0.451 residual.sugar - 19.04 volatile.acidity
## - 0.804 alcohol - 39.4 chlorides + 0.0127 total.sulfur.dioxide
## - 0.64 fixed.acidity
##
## Rule 9: [46 cases, mean 6.3, range 5 to 7, est err 0.4]
##
## if
## volatile.acidity <= 0.26
## residual.sugar > 12.75
## residual.sugar <= 14.05
## free.sulfur.dioxide <= 49
## alcohol <= 10.2
## then
## outcome = 11.9 - 13.32 volatile.acidity + 0.0216 total.sulfur.dioxide
## - 8.01 sulphates - 0.0521 free.sulfur.dioxide - 16.2 chlorides
##
## Rule 10: [1410 cases, mean 6.4, range 3 to 9, est err 0.6]
##
## if
## free.sulfur.dioxide > 20
## alcohol > 10.2
## then
## outcome = 247.3 - 250 density + 0.11 residual.sugar + 1.26 pH
## + 0.116 alcohol + 1.04 sulphates + 0.11 fixed.acidity
## - 0.26 volatile.acidity + 0.0012 free.sulfur.dioxide
##
##
## Evaluation on training data (3750 cases):
##
## Average |error| 0.4
## Relative |error| 0.63
## Correlation coefficient 0.67
##
##
## Attribute usage:
## Conds Model
##
## 85% 99% alcohol
## 73% 84% free.sulfur.dioxide
## 40% 97% volatile.acidity
## 33% 99% residual.sugar
## 15% 11% citric.acid
## 5% 62% fixed.acidity
## 98% density
## 85% pH
## 66% sulphates
## 21% total.sulfur.dioxide
## 8% chlorides
##
##
## Time: 0.1 secs
# generate predictions for the model on the held-out test rows
p.cubist <- predict(m.cubist, wine_test)
# summary statistics about the predictions
summary(p.cubist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.315 5.574 6.093 6.028 6.437 7.647
# correlation between the predicted and true values
cor(p.cubist, wine_test$quality)
## [1] 0.5683117
# Mean absolute error between two equal-length numeric vectors.
#
# actual:    the true values
# predicted: the model's predictions for the same observations
# returns:   the average of |actual - predicted|
#
# Defined here only when no MAE() function is already available, so the
# script runs on its own without overriding an existing definition.
if (!exists("MAE", mode = "function")) {
  MAE <- function(actual, predicted) {
    mean(abs(actual - predicted))
  }
}
# mean absolute error of predicted and true values
MAE(wine_test$quality, p.cubist)
## [1] 0.5306253