# Load the white wine data (semicolon-separated file) and inspect its structure
whitewine <- read.csv(
  file = "winequality-white.csv",
  sep = ";",
  stringsAsFactors = TRUE
)
str(whitewine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
# the distribution of quality ratings
# (histogram of the quality column, then summary statistics for every column)
hist(whitewine$quality)

summary(whitewine)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0        1st Qu.:0.9917  
##  Median :0.04300   Median : 34.00      Median :134.0        Median :0.9937  
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4        Mean   :0.9940  
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0        3rd Qu.:0.9961  
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
##  Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
##  3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
##  Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000
# Split into training and test sets by row position: the first 3750 rows
# train the models, every remaining row is held out for evaluation.
# The upper bound comes from nrow() so the split follows the data actually
# loaded, and the guard stops early if the file is too short for this split.
stopifnot(nrow(whitewine) > 3750)
whitewine_train <- whitewine[1:3750, ]
whitewine_test <- whitewine[3751:nrow(whitewine), ]

# Training a model on the data ----

# fit a regression tree that predicts quality from all other columns
library(rpart)
m.rpart <- rpart(quality ~ ., data = whitewine_train)
# get basic information about the tree
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 3140.06000 5.886933  
##    2) alcohol< 10.85 2473 1510.66200 5.609381  
##      4) volatile.acidity>=0.2425 1406  740.15080 5.402560  
##        8) volatile.acidity>=0.4225 182   92.99451 4.994505 *
##        9) volatile.acidity< 0.4225 1224  612.34560 5.463235 *
##      5) volatile.acidity< 0.2425 1067  631.12090 5.881912 *
##    3) alcohol>=10.85 1277 1069.95800 6.424432  
##      6) free.sulfur.dioxide< 11.5 93   99.18280 5.473118 *
##      7) free.sulfur.dioxide>=11.5 1184  879.99920 6.499155  
##       14) alcohol< 11.85 611  447.38130 6.296236 *
##       15) alcohol>=11.85 573  380.63180 6.715532 *
# get more detailed information about the tree
# (CP table, variable importance, and the splits considered at each node)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = whitewine_train)
##   n= 3750 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.17816211      0 1.0000000 1.0012045 0.02391262
## 2 0.04439109      1 0.8218379 0.8234799 0.02239999
## 3 0.02890893      2 0.7774468 0.7861089 0.02209662
## 4 0.01655575      3 0.7485379 0.7576577 0.02093863
## 5 0.01108600      4 0.7319821 0.7427597 0.02047926
## 6 0.01000000      5 0.7208961 0.7354229 0.02036136
## 
## Variable importance
##              alcohol              density            chlorides 
##                   38                   23                   12 
##     volatile.acidity total.sulfur.dioxide  free.sulfur.dioxide 
##                   12                    7                    6 
##            sulphates                   pH       residual.sugar 
##                    1                    1                    1 
## 
## Node number 1: 3750 observations,    complexity param=0.1781621
##   mean=5.886933, MSE=0.8373493 
##   left son=2 (2473 obs) right son=3 (1277 obs)
##   Primary splits:
##       alcohol              < 10.85    to the left,  improve=0.17816210, (0 missing)
##       density              < 0.992385 to the right, improve=0.11980970, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.08199995, (0 missing)
##       total.sulfur.dioxide < 153.5    to the right, improve=0.03875440, (0 missing)
##       free.sulfur.dioxide  < 11.75    to the left,  improve=0.03632119, (0 missing)
##   Surrogate splits:
##       density              < 0.99201  to the right, agree=0.869, adj=0.614, (0 split)
##       chlorides            < 0.0375   to the right, agree=0.773, adj=0.334, (0 split)
##       total.sulfur.dioxide < 102.5    to the right, agree=0.705, adj=0.132, (0 split)
##       sulphates            < 0.345    to the right, agree=0.670, adj=0.031, (0 split)
##       fixed.acidity        < 5.25     to the right, agree=0.662, adj=0.009, (0 split)
## 
## Node number 2: 2473 observations,    complexity param=0.04439109
##   mean=5.609381, MSE=0.6108623 
##   left son=4 (1406 obs) right son=5 (1067 obs)
##   Primary splits:
##       volatile.acidity    < 0.2425   to the right, improve=0.09227123, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.04177240, (0 missing)
##       alcohol             < 10.15    to the left,  improve=0.03313802, (0 missing)
##       citric.acid         < 0.205    to the left,  improve=0.02721200, (0 missing)
##       pH                  < 3.325    to the left,  improve=0.01860335, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 111.5    to the right, agree=0.610, adj=0.097, (0 split)
##       pH                   < 3.295    to the left,  agree=0.598, adj=0.067, (0 split)
##       alcohol              < 10.05    to the left,  agree=0.590, adj=0.049, (0 split)
##       sulphates            < 0.715    to the left,  agree=0.584, adj=0.037, (0 split)
##       residual.sugar       < 1.85     to the right, agree=0.581, adj=0.029, (0 split)
## 
## Node number 3: 1277 observations,    complexity param=0.02890893
##   mean=6.424432, MSE=0.8378682 
##   left son=6 (93 obs) right son=7 (1184 obs)
##   Primary splits:
##       free.sulfur.dioxide  < 11.5     to the left,  improve=0.08484051, (0 missing)
##       alcohol              < 11.85    to the left,  improve=0.06149941, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.04259695, (0 missing)
##       residual.sugar       < 1.275    to the left,  improve=0.02795662, (0 missing)
##       total.sulfur.dioxide < 67.5     to the left,  improve=0.02541719, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 48.5     to the left,  agree=0.937, adj=0.14, (0 split)
## 
## Node number 4: 1406 observations,    complexity param=0.011086
##   mean=5.40256, MSE=0.526423 
##   left son=8 (182 obs) right son=9 (1224 obs)
##   Primary splits:
##       volatile.acidity     < 0.4225   to the right, improve=0.04703189, (0 missing)
##       free.sulfur.dioxide  < 17.5     to the left,  improve=0.04607770, (0 missing)
##       total.sulfur.dioxide < 86.5     to the left,  improve=0.02894310, (0 missing)
##       alcohol              < 10.25    to the left,  improve=0.02890077, (0 missing)
##       chlorides            < 0.0455   to the right, improve=0.02096635, (0 missing)
##   Surrogate splits:
##       density       < 0.99107  to the left,  agree=0.874, adj=0.027, (0 split)
##       citric.acid   < 0.11     to the left,  agree=0.873, adj=0.022, (0 split)
##       fixed.acidity < 9.85     to the right, agree=0.873, adj=0.016, (0 split)
##       chlorides     < 0.206    to the right, agree=0.871, adj=0.005, (0 split)
## 
## Node number 5: 1067 observations
##   mean=5.881912, MSE=0.591491 
## 
## Node number 6: 93 observations
##   mean=5.473118, MSE=1.066482 
## 
## Node number 7: 1184 observations,    complexity param=0.01655575
##   mean=6.499155, MSE=0.7432425 
##   left son=14 (611 obs) right son=15 (573 obs)
##   Primary splits:
##       alcohol        < 11.85    to the left,  improve=0.05907511, (0 missing)
##       fixed.acidity  < 7.35     to the right, improve=0.04400660, (0 missing)
##       density        < 0.991395 to the right, improve=0.02522410, (0 missing)
##       residual.sugar < 1.225    to the left,  improve=0.02503936, (0 missing)
##       pH             < 3.245    to the left,  improve=0.02417936, (0 missing)
##   Surrogate splits:
##       density              < 0.991115 to the right, agree=0.710, adj=0.401, (0 split)
##       volatile.acidity     < 0.2675   to the left,  agree=0.665, adj=0.307, (0 split)
##       chlorides            < 0.0365   to the right, agree=0.631, adj=0.237, (0 split)
##       total.sulfur.dioxide < 126.5    to the right, agree=0.566, adj=0.103, (0 split)
##       residual.sugar       < 1.525    to the left,  agree=0.560, adj=0.091, (0 split)
## 
## Node number 8: 182 observations
##   mean=4.994505, MSE=0.5109588 
## 
## Node number 9: 1224 observations
##   mean=5.463235, MSE=0.5002823 
## 
## Node number 14: 611 observations
##   mean=6.296236, MSE=0.7322117 
## 
## Node number 15: 573 observations
##   mean=6.715532, MSE=0.6642788
# install rpart.plot only when it is missing, so re-running the script does
# not reinstall the package every time
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
# attach rpart.plot to draw the fitted tree
library(rpart.plot)
# plain diagram with digits = 3
rpart.plot(m.rpart, digits = 3)

# second diagram: digits = 4 plus changed layout/annotation options
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)

# Evaluate model performance ----

# predict quality for the held-out test rows
p.rpart <- predict(m.rpart, newdata = whitewine_test)
# summary statistics of the predictions, to set against the actual values
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.995   5.463   5.882   5.999   6.296   6.716
summary(whitewine_test[["quality"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.848   6.000   8.000
# correlation between predicted and actual quality
cor(x = p.rpart, y = whitewine_test[["quality"]])
## [1] 0.4931608
# Mean absolute error: the average size of the differences between
# `actual` and `predicted`. Either argument may be a single value, which R
# recycles against the other.
MAE <- function(actual, predicted) {
  abs_diff <- abs(actual - predicted)
  mean(abs_diff)
}
# mean absolute error between actual and predicted values
# (arguments passed in the order MAE() declares them: actual, predicted)
MAE(whitewine_test$quality, p.rpart)
## [1] 0.5732104
# baseline: mean absolute error when every wine is predicted to have the
# mean quality of the training set
mean(whitewine_train$quality)
## [1] 5.886933
# use the computed training mean rather than a hand-typed rounding of it;
# NOTE(review): the recorded result that follows was produced with the
# literal 5.87 and should be refreshed on the next run
MAE(whitewine_test$quality, mean(whitewine_train$quality))
## [1] 0.5815679

# Improving model performance ----

# install plyr and Cubist only when they are missing, so re-running the
# script does not reinstall them every time
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
if (!requireNamespace("Cubist", quietly = TRUE)) {
  install.packages("Cubist")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(Cubist)
## Loading required package: lattice
# fit a Cubist model: the predictors are every column except the 12th
# (quality), and quality is the outcome
m.cubist <- cubist(x = whitewine_train[-12], y = whitewine_train$quality)
# print the model overview (sample, predictor, committee and rule counts)
m.cubist
## 
## Call:
## cubist.default(x = whitewine_train[-12], y = whitewine_train$quality)
## 
## Number of samples: 3750 
## Number of predictors: 11 
## 
## Number of committees: 1 
## Number of rules: 10
# display the model itself: its rules, each with a linear model, followed by
# the training-set evaluation and attribute usage
summary(m.cubist)
## 
## Call:
## cubist.default(x = whitewine_train[-12], y = whitewine_train$quality)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Tue Feb  3 02:02:40 2026
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 3750 cases (12 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [918 cases, mean 5.3, range 3 to 7, est err 0.5]
## 
##     if
##  volatile.acidity > 0.26
##  alcohol <= 10.2
##     then
##  outcome = 66.6 + 0.187 alcohol + 0.041 residual.sugar - 65 density
##            - 1.38 volatile.acidity + 0.5 pH + 0.0028 free.sulfur.dioxide
## 
##   Rule 2: [177 cases, mean 5.5, range 4 to 8, est err 0.5]
## 
##     if
##  citric.acid > 0.42
##  residual.sugar <= 14.05
##  free.sulfur.dioxide > 49
##     then
##  outcome = 32.5 + 0.379 alcohol - 0.024 residual.sugar - 31 density
##            - 0.54 volatile.acidity + 0.15 sulphates
##            + 0.0003 total.sulfur.dioxide + 0.07 pH + 0.4 chlorides
##            + 0.01 fixed.acidity
## 
##   Rule 3: [490 cases, mean 5.7, range 3 to 8, est err 0.5]
## 
##     if
##  volatile.acidity <= 0.26
##  residual.sugar <= 12.75
##  free.sulfur.dioxide <= 49
##  alcohol <= 10.2
##     then
##  outcome = 253.6 - 252 density + 0.102 residual.sugar
##            - 2.63 volatile.acidity + 0.0149 free.sulfur.dioxide
##            + 1.27 sulphates + 0.52 pH + 0.012 alcohol
## 
##   Rule 4: [71 cases, mean 5.8, range 5 to 7, est err 0.4]
## 
##     if
##  fixed.acidity <= 7.5
##  volatile.acidity <= 0.26
##  residual.sugar > 14.05
##  alcohol > 9.1
##     then
##  outcome = 127.2 - 125 density + 0.055 residual.sugar
##            - 2.47 volatile.acidity + 0.24 fixed.acidity + 0.67 sulphates
##            + 0.0017 total.sulfur.dioxide + 1.8 chlorides + 0.23 pH
##            - 0.0015 free.sulfur.dioxide + 0.013 alcohol
## 
##   Rule 5: [446 cases, mean 5.8, range 3 to 9, est err 0.5]
## 
##     if
##  citric.acid <= 0.42
##  residual.sugar <= 14.05
##  free.sulfur.dioxide > 49
##     then
##  outcome = 29.6 + 0.372 alcohol + 2.81 citric.acid
##            - 2.94 volatile.acidity - 28 density + 0.013 residual.sugar
##            + 0.13 sulphates + 0.0003 total.sulfur.dioxide
##            + 0.01 fixed.acidity
## 
##   Rule 6: [451 cases, mean 5.9, range 3 to 8, est err 0.7]
## 
##     if
##  free.sulfur.dioxide <= 20
##  alcohol > 10.2
##     then
##  outcome = 16.2 + 0.0537 free.sulfur.dioxide + 0.311 alcohol
##            - 2.63 volatile.acidity + 0.037 residual.sugar
##            - 0.2 fixed.acidity - 13 density + 0.08 pH
## 
##   Rule 7: [113 cases, mean 5.9, range 5 to 7, est err 0.5]
## 
##     if
##  fixed.acidity <= 7.5
##  volatile.acidity <= 0.26
##  residual.sugar > 14.05
##  alcohol <= 9.1
##     then
##  outcome = -8.3 + 2.204 alcohol - 0.143 residual.sugar
##            + 0.0066 total.sulfur.dioxide - 1.65 sulphates
##            - 0.0092 free.sulfur.dioxide - 3 density
## 
##   Rule 8: [35 cases, mean 6.2, range 3 to 8, est err 0.8]
## 
##     if
##  fixed.acidity > 7.5
##  volatile.acidity <= 0.26
##  residual.sugar > 14.05
##  alcohol <= 10.2
##     then
##  outcome = 29.5 - 0.451 residual.sugar - 19.04 volatile.acidity
##            - 0.804 alcohol - 39.4 chlorides + 0.0127 total.sulfur.dioxide
##            - 0.64 fixed.acidity
## 
##   Rule 9: [46 cases, mean 6.3, range 5 to 7, est err 0.4]
## 
##     if
##  volatile.acidity <= 0.26
##  residual.sugar > 12.75
##  residual.sugar <= 14.05
##  free.sulfur.dioxide <= 49
##  alcohol <= 10.2
##     then
##  outcome = 11.9 - 13.32 volatile.acidity + 0.0216 total.sulfur.dioxide
##            - 8.01 sulphates - 0.0521 free.sulfur.dioxide - 16.2 chlorides
## 
##   Rule 10: [1410 cases, mean 6.4, range 3 to 9, est err 0.6]
## 
##     if
##  free.sulfur.dioxide > 20
##  alcohol > 10.2
##     then
##  outcome = 247.3 - 250 density + 0.11 residual.sugar + 1.26 pH
##            + 0.116 alcohol + 1.04 sulphates + 0.11 fixed.acidity
##            - 0.26 volatile.acidity + 0.0012 free.sulfur.dioxide
## 
## 
## Evaluation on training data (3750 cases):
## 
##     Average  |error|                0.4
##     Relative |error|               0.63
##     Correlation coefficient        0.67
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##     85%    99%    alcohol
##     73%    84%    free.sulfur.dioxide
##     40%    97%    volatile.acidity
##     33%    99%    residual.sugar
##     15%    11%    citric.acid
##      5%    62%    fixed.acidity
##            98%    density
##            85%    pH
##            66%    sulphates
##            21%    total.sulfur.dioxide
##             8%    chlorides
## 
## 
## Time: 0.2 secs
# predict quality for the test rows with the Cubist model
p.cubist <- predict(m.cubist, newdata = whitewine_test)
summary(p.cubist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.315   5.574   6.093   6.028   6.437   7.647
# how strongly the predictions track the true quality values
cor(x = p.cubist, y = whitewine_test[["quality"]])
## [1] 0.5683117
# mean absolute error of the predictions against the true values
# (MAE() is the helper defined earlier in this script)
MAE(actual = whitewine_test[["quality"]], predicted = p.cubist)
## [1] 0.5306253

# Red wine dataset ----

# Load the red wine data (read with read.csv's default separator) and
# inspect its structure
redwine <- read.csv(
  file = "winequality-red.csv",
  stringsAsFactors = TRUE
)
str(redwine)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
# the distribution of quality ratings
# (histogram of the quality column, then summary statistics for every column)
hist(redwine$quality)

summary(redwine)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
# Split by row position: the first 1200 rows train, the rest are held out.
# redwine has 1599 rows, so the training index must stop at 1200. Indexing
# 1:3750 (the white wine split) padded the training set with all-NA rows and
# also pulled the test rows 1201:1599 into training.
# NOTE(review): the recorded '##' output further down was produced with the
# old split and needs to be regenerated.
stopifnot(nrow(redwine) > 1200)
redwine_train <- redwine[1:1200, ]
redwine_test <- redwine[1201:nrow(redwine), ]

# Training a model on the data ----

# rpart was already attached earlier in the script; attaching it again is
# harmless
library(rpart)
# refit the regression tree on the red wine training rows
# (this overwrites the m.rpart object fitted on the white wine data)
m.rpart <- rpart(quality ~ ., data = redwine_train)
# get basic information about the tree
m.rpart
## n=1599 (2151 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1599 1042.16500 5.636023  
##    2) alcohol< 10.525 983  424.15870 5.366226  
##      4) sulphates< 0.575 391  128.09720 5.150895 *
##      5) sulphates>=0.575 592  265.95780 5.508446  
##       10) volatile.acidity>=0.405 448  175.87280 5.404018 *
##       11) volatile.acidity< 0.405 144   70.00000 5.833333 *
##    3) alcohol>=10.525 616  432.27110 6.066558  
##      6) sulphates< 0.645 272  191.86760 5.727941  
##       12) volatile.acidity>=1.015 10    6.00000 4.000000 *
##       13) volatile.acidity< 1.015 262  154.87020 5.793893  
##         26) volatile.acidity>=0.495 146   73.67123 5.575342 *
##         27) volatile.acidity< 0.495 116   65.44828 6.068966 *
##      7) sulphates>=0.645 344  184.55520 6.334302  
##       14) alcohol< 11.55 206  101.96600 6.121359  
##         28) volatile.acidity>=0.395 111   37.42342 5.927928 *
##         29) volatile.acidity< 0.395 95   55.53684 6.347368  
##           58) pH>=3.255 59   29.72881 6.067797 *
##           59) pH< 3.255 36   13.63889 6.805556 *
##       15) alcohol>=11.55 138   59.30435 6.652174 *
# get more detailed information about the tree
# (CP table, variable importance, and the splits considered at each node)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = redwine_train)
##   n=1599 (2151 observations deleted due to missingness)
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.17822061      0 1.0000000 1.0009884 0.03792922
## 2 0.05358865      1 0.8217794 0.8363434 0.03654274
## 3 0.02974329      2 0.7681907 0.8079661 0.03419475
## 4 0.02888577      3 0.7384474 0.7883367 0.03371542
## 5 0.02234278      4 0.7095617 0.7597444 0.03163502
## 6 0.01927238      5 0.6872189 0.7364469 0.03042210
## 7 0.01511346      6 0.6679465 0.7138108 0.02942769
## 8 0.01015909      7 0.6528331 0.7004774 0.02834643
## 9 0.01000000      9 0.6325149 0.6956021 0.02861206
## 
## Variable importance
##              alcohol     volatile.acidity              density 
##                   31                   17                   13 
##            sulphates        fixed.acidity            chlorides 
##                   13                    7                    6 
##          citric.acid                   pH total.sulfur.dioxide 
##                    5                    4                    3 
##  free.sulfur.dioxide 
##                    1 
## 
## Node number 1: 1599 observations,    complexity param=0.1782206
##   mean=5.636023, MSE=0.6517605 
##   left son=2 (983 obs) right son=3 (616 obs)
##   Primary splits:
##       alcohol          < 10.525   to the left,  improve=0.17822060, (0 missing)
##       sulphates        < 0.645    to the left,  improve=0.12565160, (0 missing)
##       volatile.acidity < 0.425    to the right, improve=0.11400620, (0 missing)
##       citric.acid      < 0.295    to the left,  improve=0.07225368, (0 missing)
##       density          < 0.99539  to the right, improve=0.06402980, (0 missing)
##   Surrogate splits:
##       density              < 0.995575 to the right, agree=0.762, adj=0.383, (0 split)
##       chlorides            < 0.0685   to the right, agree=0.690, adj=0.195, (0 split)
##       volatile.acidity     < 0.3675   to the right, agree=0.662, adj=0.123, (0 split)
##       fixed.acidity        < 6.75     to the right, agree=0.654, adj=0.101, (0 split)
##       total.sulfur.dioxide < 17.5     to the right, agree=0.641, adj=0.068, (0 split)
## 
## Node number 2: 983 observations,    complexity param=0.02888577
##   mean=5.366226, MSE=0.4314941 
##   left son=4 (391 obs) right son=5 (592 obs)
##   Primary splits:
##       sulphates            < 0.575    to the left,  improve=0.07097282, (0 missing)
##       volatile.acidity     < 0.335    to the right, improve=0.06388554, (0 missing)
##       alcohol              < 9.85     to the left,  improve=0.05212216, (0 missing)
##       fixed.acidity        < 10.85    to the left,  improve=0.03084011, (0 missing)
##       total.sulfur.dioxide < 83.5     to the right, improve=0.02749674, (0 missing)
##   Surrogate splits:
##       density              < 0.996225 to the left,  agree=0.662, adj=0.151, (0 split)
##       volatile.acidity     < 0.6525   to the right, agree=0.636, adj=0.084, (0 split)
##       fixed.acidity        < 6.05     to the left,  agree=0.609, adj=0.018, (0 split)
##       citric.acid          < 0.115    to the left,  agree=0.609, adj=0.018, (0 split)
##       total.sulfur.dioxide < 9.5      to the left,  agree=0.608, adj=0.015, (0 split)
## 
## Node number 3: 616 observations,    complexity param=0.05358865
##   mean=6.066558, MSE=0.7017388 
##   left son=6 (272 obs) right son=7 (344 obs)
##   Primary splits:
##       sulphates        < 0.645    to the left,  improve=0.12919720, (0 missing)
##       volatile.acidity < 0.87     to the right, improve=0.11482610, (0 missing)
##       citric.acid      < 0.295    to the left,  improve=0.10819510, (0 missing)
##       alcohol          < 11.55    to the left,  improve=0.10309310, (0 missing)
##       pH               < 3.355    to the right, improve=0.07557599, (0 missing)
##   Surrogate splits:
##       citric.acid      < 0.245    to the left,  agree=0.683, adj=0.283, (0 split)
##       fixed.acidity    < 7.85     to the left,  agree=0.666, adj=0.243, (0 split)
##       volatile.acidity < 0.5875   to the right, agree=0.653, adj=0.213, (0 split)
##       density          < 0.994915 to the left,  agree=0.635, adj=0.173, (0 split)
##       pH               < 3.405    to the right, agree=0.630, adj=0.162, (0 split)
## 
## Node number 4: 391 observations
##   mean=5.150895, MSE=0.3276143 
## 
## Node number 5: 592 observations,    complexity param=0.01927238
##   mean=5.508446, MSE=0.449253 
##   left son=10 (448 obs) right son=11 (144 obs)
##   Primary splits:
##       volatile.acidity     < 0.405    to the right, improve=0.07551952, (0 missing)
##       total.sulfur.dioxide < 81.5     to the right, improve=0.05845854, (0 missing)
##       alcohol              < 9.85     to the left,  improve=0.05386312, (0 missing)
##       fixed.acidity        < 10.95    to the left,  improve=0.05335172, (0 missing)
##       chlorides            < 0.0975   to the right, improve=0.03262428, (0 missing)
##   Surrogate splits:
##       fixed.acidity       < 10.45    to the left,  agree=0.787, adj=0.125, (0 split)
##       chlorides           < 0.0565   to the right, agree=0.765, adj=0.035, (0 split)
##       citric.acid         < 0.365    to the left,  agree=0.764, adj=0.028, (0 split)
##       free.sulfur.dioxide < 2.5      to the right, agree=0.764, adj=0.028, (0 split)
##       alcohol             < 8.6      to the right, agree=0.758, adj=0.007, (0 split)
## 
## Node number 6: 272 observations,    complexity param=0.02974329
##   mean=5.727941, MSE=0.7053958 
##   left son=12 (10 obs) right son=13 (262 obs)
##   Primary splits:
##       volatile.acidity < 1.015    to the right, improve=0.16155630, (0 missing)
##       alcohol          < 11.45    to the left,  improve=0.11901850, (0 missing)
##       citric.acid      < 0.255    to the left,  improve=0.11313180, (0 missing)
##       pH               < 3.365    to the right, improve=0.09055459, (0 missing)
##       sulphates        < 0.585    to the left,  improve=0.04970438, (0 missing)
## 
## Node number 7: 344 observations,    complexity param=0.02234278
##   mean=6.334302, MSE=0.5364978 
##   left son=14 (206 obs) right son=15 (138 obs)
##   Primary splits:
##       alcohol              < 11.55    to the left,  improve=0.12616750, (0 missing)
##       chlorides            < 0.0785   to the right, improve=0.05765389, (0 missing)
##       total.sulfur.dioxide < 101.5    to the right, improve=0.05496021, (0 missing)
##       density              < 0.99537  to the right, improve=0.04412990, (0 missing)
##       volatile.acidity     < 0.425    to the right, improve=0.04136603, (0 missing)
##   Surrogate splits:
##       density        < 0.994875 to the right, agree=0.701, adj=0.254, (0 split)
##       chlorides      < 0.053    to the right, agree=0.651, adj=0.130, (0 split)
##       fixed.acidity  < 5.55     to the right, agree=0.640, adj=0.101, (0 split)
##       residual.sugar < 4.25     to the left,  agree=0.628, adj=0.072, (0 split)
##       citric.acid    < 0.635    to the left,  agree=0.622, adj=0.058, (0 split)
## 
## Node number 10: 448 observations
##   mean=5.404018, MSE=0.3925731 
## 
## Node number 11: 144 observations
##   mean=5.833333, MSE=0.4861111 
## 
## Node number 12: 10 observations
##   mean=4, MSE=0.6 
## 
## Node number 13: 262 observations,    complexity param=0.01511346
##   mean=5.793893, MSE=0.5911077 
##   left son=26 (146 obs) right son=27 (116 obs)
##   Primary splits:
##       volatile.acidity < 0.495    to the right, improve=0.10170270, (0 missing)
##       alcohol          < 11.45    to the left,  improve=0.09838534, (0 missing)
##       citric.acid      < 0.255    to the left,  improve=0.09415346, (0 missing)
##       pH               < 3.295    to the right, improve=0.07618253, (0 missing)
##       density          < 0.995155 to the right, improve=0.05214905, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.235    to the left,  agree=0.866, adj=0.698, (0 split)
##       pH                   < 3.305    to the right, agree=0.733, adj=0.397, (0 split)
##       fixed.acidity        < 7.85     to the left,  agree=0.691, adj=0.302, (0 split)
##       alcohol              < 11.85    to the left,  agree=0.641, adj=0.190, (0 split)
##       total.sulfur.dioxide < 12.5     to the right, agree=0.637, adj=0.181, (0 split)
## 
## Node number 14: 206 observations,    complexity param=0.01015909
##   mean=6.121359, MSE=0.4949807 
##   left son=28 (111 obs) right son=29 (95 obs)
##   Primary splits:
##       volatile.acidity     < 0.395    to the right, improve=0.08832113, (0 missing)
##       total.sulfur.dioxide < 49.5     to the right, improve=0.06808035, (0 missing)
##       chlorides            < 0.0945   to the right, improve=0.05079896, (0 missing)
##       citric.acid          < 0.295    to the left,  improve=0.05051307, (0 missing)
##       free.sulfur.dioxide  < 25.5     to the right, improve=0.03611908, (0 missing)
##   Surrogate splits:
##       citric.acid    < 0.285    to the left,  agree=0.733, adj=0.421, (0 split)
##       sulphates      < 0.765    to the left,  agree=0.655, adj=0.253, (0 split)
##       chlorides      < 0.0675   to the right, agree=0.617, adj=0.168, (0 split)
##       residual.sugar < 1.85     to the right, agree=0.612, adj=0.158, (0 split)
##       fixed.acidity  < 7.05     to the left,  agree=0.597, adj=0.126, (0 split)
## 
## Node number 15: 138 observations
##   mean=6.652174, MSE=0.4297417 
## 
## Node number 26: 146 observations
##   mean=5.575342, MSE=0.5045975 
## 
## Node number 27: 116 observations
##   mean=6.068966, MSE=0.5642093 
## 
## Node number 28: 111 observations
##   mean=5.927928, MSE=0.337148 
## 
## Node number 29: 95 observations,    complexity param=0.01015909
##   mean=6.347368, MSE=0.5845983 
##   left son=58 (59 obs) right son=59 (36 obs)
##   Primary splits:
##       pH                   < 3.255    to the right, improve=0.21911830, (0 missing)
##       total.sulfur.dioxide < 56.5     to the right, improve=0.18528400, (0 missing)
##       fixed.acidity        < 10       to the left,  improve=0.12899290, (0 missing)
##       free.sulfur.dioxide  < 24.5     to the right, improve=0.11666000, (0 missing)
##       alcohol              < 10.75    to the left,  improve=0.05498168, (0 missing)
##   Surrogate splits:
##       fixed.acidity        < 9.7      to the left,  agree=0.737, adj=0.306, (0 split)
##       total.sulfur.dioxide < 28.5     to the right, agree=0.737, adj=0.306, (0 split)
##       free.sulfur.dioxide  < 9.5      to the right, agree=0.716, adj=0.250, (0 split)
##       chlorides            < 0.0635   to the right, agree=0.663, adj=0.111, (0 split)
##       sulphates            < 0.935    to the left,  agree=0.663, adj=0.111, (0 split)
## 
## Node number 58: 59 observations
##   mean=6.067797, MSE=0.5038782 
## 
## Node number 59: 36 observations
##   mean=6.805556, MSE=0.378858
# use the rpart.plot package to create a visualization
library(rpart.plot)
# plain diagram of the fitted regression tree, digits = 3
rpart.plot(x = m.rpart, digits = 3)

# same tree drawn again with digits = 4 and extra layout options
rpart.plot(
  x = m.rpart,
  type = 3,
  extra = 101,
  fallen.leaves = TRUE,
  digits = 4
)

Evaluating model performance

# predict quality for the rows of redwine_test with the regression tree
p.rpart <- predict(m.rpart, newdata = redwine_test)
# summary statistics of the predicted values ...
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   5.151   5.404   5.614   5.928   6.806
# ... next to the same statistics for the observed quality scores
summary(redwine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.549   6.000   8.000
# correlation between predicted and observed quality
cor(x = p.rpart, y = redwine_test$quality)
## [1] 0.5588764
# function to calculate the mean absolute error
#
# actual:    numeric vector of observed values
# predicted: numeric vector of predictions; a single value is recycled,
#            e.g. to score a constant baseline
# na.rm:     if TRUE, missing absolute errors are dropped before averaging;
#            the default FALSE keeps the previous behaviour (result is NA)
# returns:   the mean of abs(actual - predicted)
MAE <- function(actual, predicted, na.rm = FALSE) {
  n_actual <- length(actual)
  n_predicted <- length(predicted)
  # equal lengths or a scalar on either side are fine; any other combination
  # would be recycled silently by `-` and produce a misleading result
  if (n_actual != n_predicted && n_actual != 1L && n_predicted != 1L) {
    stop("'actual' and 'predicted' must have the same length ",
         "(or one of them must have length 1)", call. = FALSE)
  }
  mean(abs(actual - predicted), na.rm = na.rm)
}
# mean absolute error between predicted and actual values
# (arguments follow the MAE(actual, predicted) signature; the result is the
# same either way because the absolute difference is symmetric)
MAE(redwine_test$quality, p.rpart)
## [1] 0.5137012
# mean absolute error between actual values and mean value
# (baseline: predict the mean training quality for every row of redwine_test)
# na.rm = TRUE is needed because redwine_train$quality contains missing
# values; without it mean() returns NA
mean(redwine_train$quality, na.rm = TRUE)
# use the computed training mean as the constant prediction instead of a
# hard-coded number
MAE(redwine_test$quality, mean(redwine_train$quality, na.rm = TRUE))
# NOTE(review): re-run to refresh the printed output for the two calls above;
# the earlier run printed NA for the mean and 0.6521554 for an MAE computed
# against the hard-coded constant 5.87

Improving model performance

library(Cubist)
# fit a Cubist model: predictors are all columns of redwine_train except the
# 12th, the response is redwine_train$quality
# NOTE(review): presumably column 12 is quality, as in the str() listing at
# the top of the file; confirm for redwine_train
# NOTE(review): the printed model reports 3750 samples, but the summary output
# shows Cubist reading only 1599 cases after ignoring unknown/NA targets;
# check how redwine_train is built
m.cubist <- cubist(x = redwine_train[-12], y = redwine_train$quality)
# print a short overview of the fitted model
m.cubist
## 
## Call:
## cubist.default(x = redwine_train[-12], y = redwine_train$quality)
## 
## Number of samples: 3750 
## Number of predictors: 11 
## 
## Number of committees: 1 
## Number of rules: 7
# display the fitted Cubist model in full: its rules, the linear model
# attached to each rule, the evaluation on training data and attribute usage
summary(m.cubist)
## 
## Call:
## cubist.default(x = redwine_train[-12], y = redwine_train$quality)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Tue Feb  3 02:02:41 2026
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## *** Ignoring cases with unknown or N/A target value
## 
## Read 1599 cases (12 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [630 cases, mean 5.3, range 3 to 8, est err 0.4]
## 
##     if
##  alcohol <= 9.8
##     then
##  outcome = 5 - 0.79 volatile.acidity - 0.099 alcohol
##            + 0.052 fixed.acidity - 0.31 citric.acid + 0.33 sulphates
##            + 0.29 pH - 0.0031 free.sulfur.dioxide
##            - 0.0007 total.sulfur.dioxide - 0.4 chlorides
## 
##   Rule 2: [589 cases, mean 5.3, range 3 to 8, est err 0.4]
## 
##     if
##  sulphates <= 0.92
##  alcohol <= 9.8
##     then
##  outcome = 5.5 + 1.28 sulphates - 0.9 volatile.acidity - 0.33 citric.acid
##            + 0.029 fixed.acidity - 0.033 alcohol
##            - 0.0008 total.sulfur.dioxide - 0.0023 free.sulfur.dioxide
##            - 0.4 chlorides - 0.1 pH
## 
##   Rule 3: [80 cases, mean 5.3, range 3 to 7, est err 0.7]
## 
##     if
##  volatile.acidity > 0.31
##  total.sulfur.dioxide <= 14
##  sulphates <= 0.63
##  alcohol > 9.8
##     then
##  outcome = 0.5 + 0.549 alcohol - 1.61 volatile.acidity + 0.36 sulphates
##            - 0.18 pH - 0.0005 total.sulfur.dioxide - 0.07 citric.acid
##            + 0.001 free.sulfur.dioxide
## 
##   Rule 4: [340 cases, mean 5.6, range 4 to 7, est err 0.5]
## 
##     if
##  volatile.acidity > 0.31
##  total.sulfur.dioxide > 14
##  sulphates <= 0.63
##  alcohol > 9.8
##     then
##  outcome = 5.1 + 2.85 sulphates + 0.19 alcohol - 0.74 citric.acid
##            - 0.69 volatile.acidity - 0.74 pH
##            - 0.0027 total.sulfur.dioxide + 0.0013 free.sulfur.dioxide
## 
##   Rule 5: [407 cases, mean 6.1, range 3 to 8, est err 0.6]
## 
##     if
##  volatile.acidity > 0.31
##  sulphates > 0.63
##  alcohol > 9.8
##     then
##  outcome = 7.6 + 0.309 alcohol - 0.0073 total.sulfur.dioxide - 1.12 pH
##            - 0.81 volatile.acidity - 0.079 fixed.acidity + 0.22 sulphates
##            + 0.002 free.sulfur.dioxide
## 
##   Rule 6: [71 cases, mean 6.2, range 5 to 8, est err 0.5]
## 
##     if
##  volatile.acidity <= 0.31
##  sulphates <= 0.73
##  alcohol > 9.8
##     then
##  outcome = 131.4 + 4.85 volatile.acidity - 124 density - 1.35 pH
##            + 0.056 fixed.acidity + 0.54 sulphates + 0.036 alcohol
##            + 0.021 residual.sugar
## 
##   Rule 7: [85 cases, mean 6.5, range 5 to 8, est err 0.4]
## 
##     if
##  volatile.acidity <= 0.31
##  sulphates > 0.73
##     then
##  outcome = 17 + 0.39 alcohol + 0.113 fixed.acidity
##            + 0.25 volatile.acidity - 16 density + 0.14 sulphates
## 
## 
## Evaluation on training data (1599 cases):
## 
##     Average  |error|                0.4
##     Relative |error|               0.62
##     Correlation coefficient        0.62
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##     96%   100%    alcohol
##     71%   100%    sulphates
##     45%   100%    volatile.acidity
##     19%    93%    total.sulfur.dioxide
##            96%    pH
##            93%    free.sulfur.dioxide
##            81%    fixed.acidity
##            74%    citric.acid
##            55%    chlorides
##             7%    density
##             3%    residual.sugar
## 
## 
## Time: 0.0 secs
# predict quality for the rows of redwine_test with the Cubist model
p.cubist <- predict(m.cubist, newdata = redwine_test)
summary(p.cubist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.455   5.109   5.639   5.592   5.974   7.045
# how closely the predictions track the observed quality scores
cor(x = p.cubist, y = redwine_test$quality)
## [1] 0.6402049
# mean absolute error of the Cubist predictions
# (MAE() is the helper function defined earlier in this file)
MAE(actual = redwine_test$quality, predicted = p.cubist)
## [1] 0.4606379
## [1] 0.4606379