# Point the session at the assignment folder so the relative CSV path resolves.
# NOTE(review): setwd() changes the working directory for everything that runs
# afterwards and ties the script to this folder layout.
setwd("~/Assignment1_files")
# Read the red-wine quality CSV into a data frame
wine <- read.csv("winequality-red.csv")
# Preview the first rows
head(wine)
# Check for missing values: count the NA entries in each column
colSums(is.na(wine))
fixed.acidity volatile.acidity citric.acid residual.sugar
0 0 0 0
chlorides free.sulfur.dioxide total.sulfur.dioxide density
0 0 0 0
pH sulphates alcohol quality
0 0 0 0
# Examine the data: show each column's type and its first few values
str(wine)
'data.frame': 1599 obs. of 12 variables:
$ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
$ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
$ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
$ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
$ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
$ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
$ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
$ density : num 0.998 0.997 0.997 0.998 0.998 ...
$ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
$ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
$ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
$ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Plot how the quality ratings are distributed across the wines
hist(
  wine$quality,
  main = "Histogram: Red Wine Quality",
  xlab = "Quality",
  ylab = "Frequency",
  col = "orange"
)
# Summary statistics (min, quartiles, median, mean, max) for every column
summary(wine)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900 Min. :0.01200
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900 1st Qu.:0.07000
Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200 Median :0.07900
Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539 Mean :0.08747
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600 3rd Qu.:0.09000
Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500 Max. :0.61100
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
Min. : 1.00 Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300
1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500
Median :14.00 Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200
Mean :15.87 Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581
3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300
Max. :72.00 Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000
alcohol quality
Min. : 8.40 Min. :3.000
1st Qu.: 9.50 1st Qu.:5.000
Median :10.20 Median :6.000
Mean :10.42 Mean :5.636
3rd Qu.:11.10 3rd Qu.:6.000
Max. :14.90 Max. :8.000
# Reproducible 80/20 train/test split of the rows of `wine`.
set.seed(123)
# seq_len() stays well-defined even for a zero-row frame (1:0 would give
# c(1, 0)); floor() makes the truncation of the fractional sample size explicit.
train_index <- sample(seq_len(nrow(wine)), floor(0.8 * nrow(wine)))
# Sampled rows form the training set; every remaining row is held out for testing
wine_train <- wine[train_index, ]
wine_test <- wine[-train_index, ]
# Fit a regression tree with rpart: quality is modelled from every other
# column of the training data (the `.` on the right-hand side of the formula)
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
# Print the fitted tree: one line per node with its split, n, deviance and yval
m.rpart
n= 1279
node), split, n, deviance, yval
* denotes terminal node
1) root 1279 864.63170 5.655981
2) alcohol< 10.525 768 346.71740 5.368490
4) sulphates< 0.555 259 79.28958 5.119691 *
5) sulphates>=0.555 509 243.23770 5.495088
10) volatile.acidity>=0.365 439 190.87930 5.419134
20) total.sulfur.dioxide>=65.5 116 23.79310 5.137931 *
21) total.sulfur.dioxide< 65.5 323 154.61920 5.520124 *
11) volatile.acidity< 0.365 70 33.94286 5.971429 *
3) alcohol>=10.525 511 359.03720 6.088063
6) sulphates< 0.585 140 107.39290 5.607143
12) volatile.acidity>=0.385 103 70.46602 5.388350
24) free.sulfur.dioxide< 5.5 21 15.80952 4.761905 *
25) free.sulfur.dioxide>=5.5 82 44.30488 5.548780 *
13) volatile.acidity< 0.385 37 18.27027 6.216216 *
7) sulphates>=0.585 371 207.04580 6.269542
14) alcohol< 11.55 217 109.09680 6.064516 *
15) alcohol>=11.55 154 75.97403 6.558442 *
# Detailed report on the tree: the complexity-parameter table, variable
# importance, and the primary/surrogate splits considered at each node
summary(m.rpart)
Call:
rpart(formula = quality ~ ., data = wine_train)
n= 1279
CP nsplit rel error xerror xstd
1 0.18375119 0 1.0000000 1.0009026 0.04165519
2 0.05158092 1 0.8162488 0.8255477 0.03975751
3 0.02797740 2 0.7646679 0.7798956 0.03619021
4 0.02541547 3 0.7366905 0.7836549 0.03670417
5 0.02157747 4 0.7112750 0.7775538 0.03637259
6 0.02129877 5 0.6896975 0.7726727 0.03630100
7 0.01441882 6 0.6683988 0.7351590 0.03511256
8 0.01197228 7 0.6539799 0.7227261 0.03484660
9 0.01000000 8 0.6420077 0.7253546 0.03530394
Variable importance
alcohol density sulphates volatile.acidity
34 14 13 13
chlorides total.sulfur.dioxide fixed.acidity free.sulfur.dioxide
7 7 5 3
citric.acid pH
2 1
Node number 1: 1279 observations, complexity param=0.1837512
mean=5.655981, MSE=0.6760217
left son=2 (768 obs) right son=3 (511 obs)
Primary splits:
alcohol < 10.525 to the left, improve=0.18375120, (0 missing)
sulphates < 0.645 to the left, improve=0.11661770, (0 missing)
volatile.acidity < 0.385 to the right, improve=0.10458550, (0 missing)
citric.acid < 0.305 to the left, improve=0.07507874, (0 missing)
density < 0.995565 to the right, improve=0.05857806, (0 missing)
Surrogate splits:
density < 0.99575 to the right, agree=0.760, adj=0.399, (0 split)
chlorides < 0.0685 to the right, agree=0.687, adj=0.217, (0 split)
volatile.acidity < 0.375 to the right, agree=0.658, adj=0.143, (0 split)
fixed.acidity < 6.75 to the right, agree=0.642, adj=0.104, (0 split)
total.sulfur.dioxide < 17.5 to the right, agree=0.635, adj=0.086, (0 split)
Node number 2: 768 observations, complexity param=0.0279774
mean=5.36849, MSE=0.451455
left son=4 (259 obs) right son=5 (509 obs)
Primary splits:
sulphates < 0.555 to the left, improve=0.06976906, (0 missing)
volatile.acidity < 0.345 to the right, improve=0.05971706, (0 missing)
alcohol < 9.975 to the left, improve=0.05678445, (0 missing)
fixed.acidity < 9.95 to the left, improve=0.03851320, (0 missing)
total.sulfur.dioxide < 83.5 to the right, improve=0.03270979, (0 missing)
Surrogate splits:
density < 0.996225 to the left, agree=0.702, adj=0.116, (0 split)
volatile.acidity < 0.7975 to the right, agree=0.681, adj=0.054, (0 split)
fixed.acidity < 6.05 to the left, agree=0.673, adj=0.031, (0 split)
total.sulfur.dioxide < 9.5 to the left, agree=0.669, adj=0.019, (0 split)
chlorides < 0.046 to the left, agree=0.665, adj=0.008, (0 split)
Node number 3: 511 observations, complexity param=0.05158092
mean=6.088063, MSE=0.7026168
left son=6 (140 obs) right son=7 (371 obs)
Primary splits:
sulphates < 0.585 to the left, improve=0.12421690, (0 missing)
citric.acid < 0.295 to the left, improve=0.09324665, (0 missing)
alcohol < 11.55 to the left, improve=0.09292045, (0 missing)
volatile.acidity < 0.87 to the right, improve=0.08900247, (0 missing)
pH < 3.355 to the right, improve=0.06547522, (0 missing)
Surrogate splits:
volatile.acidity < 0.7025 to the right, agree=0.746, adj=0.071, (0 split)
total.sulfur.dioxide < 9.5 to the left, agree=0.744, adj=0.064, (0 split)
citric.acid < 0.015 to the left, agree=0.734, adj=0.029, (0 split)
residual.sugar < 7.65 to the right, agree=0.734, adj=0.029, (0 split)
chlorides < 0.023 to the left, agree=0.730, adj=0.014, (0 split)
Node number 4: 259 observations
mean=5.119691, MSE=0.3061374
Node number 5: 509 observations, complexity param=0.02129877
mean=5.495088, MSE=0.4778737
left son=10 (439 obs) right son=11 (70 obs)
Primary splits:
volatile.acidity < 0.365 to the right, improve=0.07571027, (0 missing)
alcohol < 9.85 to the left, improve=0.06851443, (0 missing)
total.sulfur.dioxide < 46.5 to the right, improve=0.06608195, (0 missing)
fixed.acidity < 9.95 to the left, improve=0.04635665, (0 missing)
pH < 3.535 to the right, improve=0.02682268, (0 missing)
Surrogate splits:
chlorides < 0.0565 to the right, agree=0.872, adj=0.071, (0 split)
citric.acid < 0.685 to the left, agree=0.864, adj=0.014, (0 split)
free.sulfur.dioxide < 1.5 to the right, agree=0.864, adj=0.014, (0 split)
Node number 6: 140 observations, complexity param=0.02157747
mean=5.607143, MSE=0.7670918
left son=12 (103 obs) right son=13 (37 obs)
Primary splits:
volatile.acidity < 0.385 to the right, improve=0.17372260, (0 missing)
citric.acid < 0.255 to the left, improve=0.14724590, (0 missing)
alcohol < 11.45 to the left, improve=0.12835050, (0 missing)
pH < 3.365 to the right, improve=0.11277110, (0 missing)
free.sulfur.dioxide < 26.5 to the left, improve=0.08503115, (0 missing)
Surrogate splits:
citric.acid < 0.315 to the left, agree=0.864, adj=0.486, (0 split)
pH < 3.275 to the right, agree=0.807, adj=0.270, (0 split)
fixed.acidity < 8.55 to the left, agree=0.786, adj=0.189, (0 split)
free.sulfur.dioxide < 3.5 to the right, agree=0.771, adj=0.135, (0 split)
total.sulfur.dioxide < 8.5 to the right, agree=0.771, adj=0.135, (0 split)
Node number 7: 371 observations, complexity param=0.02541547
mean=6.269542, MSE=0.558075
left son=14 (217 obs) right son=15 (154 obs)
Primary splits:
alcohol < 11.55 to the left, improve=0.10613600, (0 missing)
volatile.acidity < 0.87 to the right, improve=0.05553900, (0 missing)
sulphates < 0.685 to the left, improve=0.04775179, (0 missing)
chlorides < 0.0945 to the right, improve=0.03855994, (0 missing)
citric.acid < 0.295 to the left, improve=0.03625129, (0 missing)
Surrogate splits:
density < 0.994765 to the right, agree=0.714, adj=0.312, (0 split)
fixed.acidity < 5.7 to the right, agree=0.644, adj=0.143, (0 split)
chlorides < 0.0525 to the right, agree=0.628, adj=0.104, (0 split)
pH < 3.535 to the left, agree=0.623, adj=0.091, (0 split)
total.sulfur.dioxide < 69.5 to the left, agree=0.612, adj=0.065, (0 split)
Node number 10: 439 observations, complexity param=0.01441882
mean=5.419134, MSE=0.4348047
left son=20 (116 obs) right son=21 (323 obs)
Primary splits:
total.sulfur.dioxide < 65.5 to the right, improve=0.06531339, (0 missing)
alcohol < 9.975 to the left, improve=0.05602606, (0 missing)
fixed.acidity < 9.95 to the left, improve=0.02962558, (0 missing)
pH < 3.535 to the right, improve=0.02752508, (0 missing)
volatile.acidity < 0.6175 to the right, improve=0.02592117, (0 missing)
Surrogate splits:
free.sulfur.dioxide < 19.5 to the right, agree=0.759, adj=0.086, (0 split)
residual.sugar < 3.25 to the right, agree=0.756, adj=0.078, (0 split)
sulphates < 1.6 to the right, agree=0.745, adj=0.034, (0 split)
volatile.acidity < 0.97 to the right, agree=0.743, adj=0.026, (0 split)
density < 1.00231 to the right, agree=0.740, adj=0.017, (0 split)
Node number 11: 70 observations
mean=5.971429, MSE=0.484898
Node number 12: 103 observations, complexity param=0.01197228
mean=5.38835, MSE=0.6841361
left son=24 (21 obs) right son=25 (82 obs)
Primary splits:
free.sulfur.dioxide < 5.5 to the left, improve=0.14690230, (0 missing)
alcohol < 11.45 to the left, improve=0.09834283, (0 missing)
volatile.acidity < 0.9125 to the right, improve=0.09768146, (0 missing)
total.sulfur.dioxide < 27.5 to the left, improve=0.06906661, (0 missing)
pH < 3.355 to the right, improve=0.04596367, (0 missing)
Surrogate splits:
total.sulfur.dioxide < 10.5 to the left, agree=0.854, adj=0.286, (0 split)
volatile.acidity < 1.1375 to the right, agree=0.825, adj=0.143, (0 split)
fixed.acidity < 7.9 to the right, agree=0.816, adj=0.095, (0 split)
density < 0.996995 to the right, agree=0.806, adj=0.048, (0 split)
pH < 3.62 to the right, agree=0.806, adj=0.048, (0 split)
Node number 13: 37 observations
mean=6.216216, MSE=0.4937911
Node number 14: 217 observations
mean=6.064516, MSE=0.5027501
Node number 15: 154 observations
mean=6.558442, MSE=0.4933378
Node number 20: 116 observations
mean=5.137931, MSE=0.205113
Node number 21: 323 observations
mean=5.520124, MSE=0.4786972
Node number 24: 21 observations
mean=4.761905, MSE=0.7528345
Node number 25: 82 observations
mean=5.54878, MSE=0.5403034
# Draw the fitted regression tree with rpart.plot
library(rpart.plot)
# Basic diagram
rpart.plot(m.rpart, digits = 3)
# Same tree with a few display adjustments
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  fallen.leaves = TRUE,
  digits = 4
)
# Score the held-out test rows with the fitted tree
p.rpart <- predict(m.rpart, newdata = wine_test)
# Distribution of the predicted values, to set against the actual values
summary(p.rpart)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.762 5.138 5.520 5.599 6.065 6.558
# Distribution of the actual quality ratings in the test set, for comparison
summary(wine_test$quality)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 5.000 5.500 5.556 6.000 8.000
# Correlation between the tree's predictions and the actual test ratings
cor(p.rpart, wine_test$quality)
[1] 0.5381298
# Mean absolute error between two numeric vectors.
#
# Args:
#   actual:    observed values.
#   predicted: predicted values; same length as `actual`, or a single value
#              that is compared against every element of `actual` (either
#              argument may be the length-one one).
#   na.rm:     drop NA absolute errors before averaging? Defaults to FALSE,
#              which keeps the original behaviour (any NA yields NA).
#
# Returns:
#   A single number: mean(|actual - predicted|).
#
# Stops with an error when the lengths differ and neither is 1, instead of
# letting R silently recycle the shorter vector.
MAE <- function(actual, predicted, na.rm = FALSE) {
  n_actual <- length(actual)
  n_predicted <- length(predicted)
  if (n_actual != n_predicted && n_actual != 1L && n_predicted != 1L) {
    stop(
      "`actual` and `predicted` must have the same length, or one must have length 1",
      call. = FALSE
    )
  }
  mean(abs(actual - predicted), na.rm = na.rm)
}
# Mean absolute error of the regression tree on the test set
# (actual values first, matching MAE's `actual, predicted` signature)
MAE(wine_test$quality, p.rpart)
[1] 0.4738731
# Baseline for the MAE comparison: mean quality rating of the training data
mean(wine_train$quality)
[1] 5.655981
# MAE of a baseline that predicts the training-set mean for every test wine
# (computed from the data rather than typed in as a constant)
MAE(wine_test$quality, mean(wine_train$quality))
[1] 0.6375
# Fit a Cubist model on the training data
library(Cubist)
# NOTE(review): nothing in the visible code calls plyr directly — confirm it is still needed
library(plyr)
# Predictors are every column except the 12th (quality, per the str() listing);
# the outcome is passed separately as y.
# NOTE(review): selecting the outcome by position breaks if the column order changes
m.cubist <- cubist(x = wine_train[-12], y = wine_train$quality)
# display basic information about the model tree
m.cubist
Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)
Number of samples: 1279
Number of predictors: 11
Number of committees: 1
Number of rules: 4
# Display the fitted rules with their linear models, the evaluation on the
# training data, and the attribute usage
summary(m.cubist)
Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)
Cubist [Release 2.07 GPL Edition] Thu Feb 5 14:23:13 2026
---------------------------------
Target attribute `outcome'
Read 1279 cases (12 attributes) from undefined.data
Model:
Rule 1: [385 cases, mean 5.2, range 3 to 7, est err 0.3]
if
alcohol <= 9.6
then
outcome = 5
Rule 2: [320 cases, mean 5.3, range 3 to 7, est err 0.4]
if
total.sulfur.dioxide <= 105
sulphates <= 0.92
alcohol <= 9.6
then
outcome = 4.3 + 1.47 sulphates - 0.24 volatile.acidity
- 0.0012 total.sulfur.dioxide + 0.016 alcohol
Rule 3: [65 cases, mean 5.6, range 4 to 8, est err 0.7]
if
sulphates > 0.92
then
outcome = -22.6 - 0.002 total.sulfur.dioxide + 28 density
- 0.28 volatile.acidity
Rule 4: [894 cases, mean 5.8, range 3 to 8, est err 0.6]
if
alcohol > 9.6
then
outcome = 4.8 + 0.299 alcohol - 1.03 volatile.acidity + 0.9 sulphates
- 0.004 total.sulfur.dioxide - 0.64 pH
+ 0.0065 free.sulfur.dioxide - 1.3 chlorides
Evaluation on training data (1279 cases):
Average |error| 0.5
Relative |error| 0.66
Correlation coefficient 0.59
Attribute usage:
Conds Model
96% 73% alcohol
23% 73% sulphates
19% 77% total.sulfur.dioxide
77% volatile.acidity
54% chlorides
54% free.sulfur.dioxide
54% pH
4% density
Time: 0.0 secs
# Score the held-out test rows with the Cubist model
p.cubist <- predict(m.cubist, newdata = wine_test)
# Distribution of the Cubist predictions
summary(p.cubist)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.634 5.082 5.373 5.502 5.832 6.781
# Correlation between the Cubist predictions and the actual test ratings
cor(p.cubist, wine_test$quality)
[1] 0.5765034
# Mean absolute error of the Cubist predictions against the actual test ratings
MAE(wine_test$quality, p.cubist)