##Step 1 - collecting data
# The data include examples of red and white Vinho Verde wines from Portugal, one of the world's leading wine-producing countries. Because the factors that contribute to a highly-rated wine may differ between the red and white varieties, for this analysis, we will examine only the more popular white wines. The white wine data include information on 11 chemical properties of 4,898 wine samples. For each wine, a laboratory analysis measured characteristics such as the acidity, sugar content, chlorides, sulfur, alcohol, pH, and density. The samples were then rated in a blind tasting by panels of no fewer than three judges on a quality scale ranging from zero (very bad) to 10 (excellent). In the case that the judges disagreed on the rating, the median value was used.
## Step 2: Exploring and preparing the data ----
# download the white wine data set (CSV with a header row) into a data frame
wine <- read.csv(
  file = "http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml10/whitewines.csv"
)
# inspect the structure of the wine data: column names, types, first values
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
#The wine data include 11 features and the quality outcome
# Compared to other types of machine learning models, one of the advantages of trees is that they can handle many types of data without preprocessing. This means we do not need to normalize or standardize the features.
# histogram of the quality ratings, i.e. the outcome the models will predict
hist(wine$quality)
# summary statistics (min, quartiles, mean, max) for every column of the
# wine data
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
#Our last step then is to divide into training and testing datasets. Since the wine data were already sorted into random order, we can partition into two sets of contiguous rows
# The first n_train rows form the training set; every remaining row forms the
# test set. The test rows are derived from the data actually loaded rather
# than from a hard-coded last row number, so a file with a different row
# count cannot silently produce all-NA rows.
n_train <- 3750
# fail early if the data are too short to leave any test rows
stopifnot(nrow(wine) > n_train)
train_rows <- seq_len(n_train)
wine_train <- wine[train_rows, ]
# negative indexing drops the training rows and keeps the rest, in order
wine_test <- wine[-train_rows, ]
## Step 3: Training a model on the data ----
# Although almost any implementation of decision trees can be used to perform regression tree modeling, the rpart (recursive partitioning) package offers perhaps the most faithful implementation of regression trees as they were described by the CART team.
library(rpart)
# fit a regression tree that predicts quality from all other columns of the
# training set; no control options are passed, so rpart's defaults apply
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile.acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile.acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile.acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile.acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free.sulfur.dioxide< 10.5 84 95.55952 5.369048 *
## 7) free.sulfur.dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile.acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile.acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
# get more detailed information about the tree: the complexity-parameter (CP)
# table, variable importance, and the primary/surrogate splits of each node
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.15501053 0 1.0000000 1.0008763 0.02446762
## 2 0.05098911 1 0.8449895 0.8458386 0.02333416
## 3 0.02796998 2 0.7940004 0.8038789 0.02277807
## 4 0.01970128 3 0.7660304 0.7862819 0.02194886
## 5 0.01265926 4 0.7463291 0.7625430 0.02092908
## 6 0.01007193 5 0.7336698 0.7535891 0.02050616
## 7 0.01000000 6 0.7235979 0.7447673 0.02025170
##
## Variable importance
## alcohol density volatile.acidity
## 34 21 15
## chlorides total.sulfur.dioxide free.sulfur.dioxide
## 11 7 6
## residual.sugar sulphates citric.acid
## 3 1 1
##
## Node number 1: 3750 observations, complexity param=0.1550105
## mean=5.870933, MSE=0.7854751
## left son=2 (2372 obs) right son=3 (1378 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.15501050, (0 missing)
## density < 0.992035 to the right, improve=0.10915940, (0 missing)
## chlorides < 0.0395 to the right, improve=0.07682258, (0 missing)
## total.sulfur.dioxide < 158.5 to the right, improve=0.04089663, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03636458, (0 missing)
## Surrogate splits:
## density < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
## chlorides < 0.0375 to the right, agree=0.757, adj=0.339, (0 split)
## total.sulfur.dioxide < 103.5 to the right, agree=0.690, adj=0.155, (0 split)
## residual.sugar < 5.375 to the right, agree=0.667, adj=0.094, (0 split)
## sulphates < 0.345 to the right, agree=0.647, adj=0.038, (0 split)
##
## Node number 2: 2372 observations, complexity param=0.05098911
## mean=5.604975, MSE=0.5981709
## left son=4 (1611 obs) right son=5 (761 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the right, improve=0.10585250, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03390500, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03204075, (0 missing)
## alcohol < 10.11667 to the left, improve=0.03136524, (0 missing)
## chlorides < 0.0585 to the right, improve=0.01633599, (0 missing)
## Surrogate splits:
## pH < 3.485 to the left, agree=0.694, adj=0.047, (0 split)
## sulphates < 0.755 to the left, agree=0.685, adj=0.020, (0 split)
## total.sulfur.dioxide < 105.5 to the right, agree=0.683, adj=0.011, (0 split)
## residual.sugar < 0.75 to the right, agree=0.681, adj=0.007, (0 split)
## chlorides < 0.0285 to the right, agree=0.680, adj=0.003, (0 split)
##
## Node number 3: 1378 observations, complexity param=0.02796998
## mean=6.328737, MSE=0.7765472
## left son=6 (84 obs) right son=7 (1294 obs)
## Primary splits:
## free.sulfur.dioxide < 10.5 to the left, improve=0.07699080, (0 missing)
## alcohol < 11.76667 to the left, improve=0.06210660, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.04438619, (0 missing)
## residual.sugar < 1.375 to the left, improve=0.02905351, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02613259, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 53.5 to the left, agree=0.952, adj=0.214, (0 split)
## volatile.acidity < 0.875 to the right, agree=0.940, adj=0.024, (0 split)
##
## Node number 4: 1611 observations, complexity param=0.01265926
## mean=5.43203, MSE=0.5098121
## left son=8 (688 obs) right son=9 (923 obs)
## Primary splits:
## volatile.acidity < 0.3025 to the right, improve=0.04540111, (0 missing)
## alcohol < 10.05 to the left, improve=0.03874403, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03338886, (0 missing)
## chlorides < 0.0495 to the right, improve=0.02574623, (0 missing)
## citric.acid < 0.195 to the left, improve=0.02327981, (0 missing)
## Surrogate splits:
## citric.acid < 0.215 to the left, agree=0.633, adj=0.141, (0 split)
## free.sulfur.dioxide < 20.5 to the left, agree=0.600, adj=0.063, (0 split)
## chlorides < 0.0595 to the right, agree=0.593, adj=0.047, (0 split)
## residual.sugar < 1.15 to the left, agree=0.583, adj=0.023, (0 split)
## total.sulfur.dioxide < 219.25 to the right, agree=0.582, adj=0.022, (0 split)
##
## Node number 5: 761 observations
## mean=5.971091, MSE=0.5878633
##
## Node number 6: 84 observations
## mean=5.369048, MSE=1.137613
##
## Node number 7: 1294 observations, complexity param=0.01970128
## mean=6.391036, MSE=0.6894405
## left son=14 (629 obs) right son=15 (665 obs)
## Primary splits:
## alcohol < 11.76667 to the left, improve=0.06504696, (0 missing)
## chlorides < 0.0395 to the right, improve=0.02758705, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02750932, (0 missing)
## pH < 3.055 to the left, improve=0.02307356, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02186818, (0 missing)
## Surrogate splits:
## density < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.637, adj=0.253, (0 split)
## chlorides < 0.0365 to the right, agree=0.630, adj=0.238, (0 split)
## residual.sugar < 1.475 to the left, agree=0.575, adj=0.126, (0 split)
## total.sulfur.dioxide < 128.5 to the right, agree=0.574, adj=0.124, (0 split)
##
## Node number 8: 688 observations
## mean=5.255814, MSE=0.4054895
##
## Node number 9: 923 observations
## mean=5.56338, MSE=0.5471747
##
## Node number 14: 629 observations, complexity param=0.01007193
## mean=6.173291, MSE=0.6838017
## left son=28 (11 obs) right son=29 (618 obs)
## Primary splits:
## volatile.acidity < 0.465 to the right, improve=0.06897561, (0 missing)
## total.sulfur.dioxide < 200 to the right, improve=0.04223066, (0 missing)
## residual.sugar < 0.975 to the left, improve=0.03061714, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02978501, (0 missing)
## sulphates < 0.575 to the left, improve=0.02165970, (0 missing)
## Surrogate splits:
## citric.acid < 0.045 to the left, agree=0.986, adj=0.182, (0 split)
## total.sulfur.dioxide < 279.25 to the right, agree=0.986, adj=0.182, (0 split)
##
## Node number 15: 665 observations
## mean=6.596992, MSE=0.6075098
##
## Node number 28: 11 observations
## mean=4.545455, MSE=0.9752066
##
## Node number 29: 618 observations
## mean=6.202265, MSE=0.6306098
#Nodes indicated by * are terminal or leaf nodes, which means that they result in a prediction.
# visualize the fitted tree with the rpart.plot package
library(rpart.plot)
# plain diagram of the regression tree
rpart.plot(m.rpart, digits = 3)
# same tree with a few display adjustments
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)
## Step 4: Evaluate model performance ----
# To use the regression tree model to make predictions on the test data, we use the predict() function. By default, this returns the estimated numeric value for the outcome variable
p.rpart <- predict(m.rpart, newdata = wine_test)
# distribution of the predicted values, followed by that of the actual values
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
# correlation between the predictions and the actual quality ratings
cor(x = p.rpart, y = wine_test$quality)
## [1] 0.5369525
#A correlation of 0.54 is certainly acceptable. However, the correlation only measures how strongly the predictions are related to the true value; it is not a measure of how far off the predictions were from the true values.
#Another way to think about the model's performance is to consider how far, on average, its prediction was from the true value. This measurement is called the mean absolute error (MAE).
# Mean absolute error (MAE): the average magnitude of the element-wise
# differences between two numeric vectors.
#   actual    - observed values
#   predicted - predicted values; R's usual recycling applies, so a single
#               number can stand in for a constant prediction
# Returns a single number (NA if any difference is NA).
MAE <- function(actual, predicted) {
  abs_errors <- abs(actual - predicted)
  mean(abs_errors)
}
# mean absolute error between predicted and actual values
# (arguments are passed in the MAE(actual, predicted) order; the result is the
# same either way because the absolute difference is symmetric)
MAE(wine_test$quality, p.rpart)
## [1] 0.5872652
# baseline for comparison: the MAE obtained by always predicting the
# (rounded) mean quality of the training set
mean(wine_train$quality) # result = 5.87
## [1] 5.870933
MAE(wine_test$quality, 5.87)
## [1] 0.6722474
## Step 5: Improving model performance ----
#To improve the performance of our learner, let's try to build a model tree. Recall that a model tree improves on regression trees by replacing the leaf nodes with regression models. This often results in more accurate results than regression trees, which use only a single value for prediction at the leaf nodes.
# train a M5' Model Tree
library(RWeka)
# fit the model tree with the same formula and training data as the rpart
# regression tree, so the two models can be compared directly
m.m5p <- M5P(quality ~ ., data = wine_train)
# display the tree
m.m5p
## M5 pruned model tree:
## (using smoothed linear models)
##
## alcohol <= 10.85 :
## | volatile.acidity <= 0.237 :
## | | fixed.acidity <= 6.85 : LM1 (406/66.024%)
## | | fixed.acidity > 6.85 :
## | | | free.sulfur.dioxide <= 24.5 : LM2 (113/87.697%)
## | | | free.sulfur.dioxide > 24.5 :
## | | | | alcohol <= 9.15 :
## | | | | | citric.acid <= 0.305 :
## | | | | | | residual.sugar <= 14.45 :
## | | | | | | | residual.sugar <= 13.8 :
## | | | | | | | | chlorides <= 0.052 : LM3 (6/77.537%)
## | | | | | | | | chlorides > 0.052 : LM4 (13/0%)
## | | | | | | | residual.sugar > 13.8 : LM5 (11/0%)
## | | | | | | residual.sugar > 14.45 : LM6 (12/0%)
## | | | | | citric.acid > 0.305 :
## | | | | | | total.sulfur.dioxide <= 169.5 :
## | | | | | | | total.sulfur.dioxide <= 161.5 :
## | | | | | | | | pH <= 3.355 :
## | | | | | | | | | volatile.acidity <= 0.215 :
## | | | | | | | | | | free.sulfur.dioxide <= 44 : LM7 (3/53.19%)
## | | | | | | | | | | free.sulfur.dioxide > 44 : LM8 (8/48.858%)
## | | | | | | | | | volatile.acidity > 0.215 : LM9 (3/0%)
## | | | | | | | | pH > 3.355 : LM10 (4/0%)
## | | | | | | | total.sulfur.dioxide > 161.5 : LM11 (6/0%)
## | | | | | | total.sulfur.dioxide > 169.5 :
## | | | | | | | sulphates <= 0.56 :
## | | | | | | | | free.sulfur.dioxide <= 48.5 : LM12 (7/0%)
## | | | | | | | | free.sulfur.dioxide > 48.5 :
## | | | | | | | | | fixed.acidity <= 7.3 : LM13 (5/0%)
## | | | | | | | | | fixed.acidity > 7.3 : LM14 (4/0%)
## | | | | | | | sulphates > 0.56 : LM15 (11/0%)
## | | | | alcohol > 9.15 :
## | | | | | density <= 0.996 :
## | | | | | | sulphates <= 0.395 : LM16 (38/85.791%)
## | | | | | | sulphates > 0.395 : LM17 (120/71.353%)
## | | | | | density > 0.996 :
## | | | | | | residual.sugar <= 14.7 : LM18 (84/45.874%)
## | | | | | | residual.sugar > 14.7 : LM19 (24/62.764%)
## | volatile.acidity > 0.237 :
## | | alcohol <= 10.15 :
## | | | volatile.acidity <= 0.302 :
## | | | | citric.acid <= 0.265 :
## | | | | | free.sulfur.dioxide <= 25.5 : LM20 (39/41.77%)
## | | | | | free.sulfur.dioxide > 25.5 : LM21 (131/61.681%)
## | | | | citric.acid > 0.265 :
## | | | | | citric.acid <= 0.395 : LM22 (213/72.749%)
## | | | | | citric.acid > 0.395 : LM23 (189/62.097%)
## | | | volatile.acidity > 0.302 : LM24 (552/64.09%)
## | | alcohol > 10.15 :
## | | | free.sulfur.dioxide <= 26.5 : LM25 (151/75.998%)
## | | | free.sulfur.dioxide > 26.5 :
## | | | | total.sulfur.dioxide <= 161.5 : LM26 (142/74.4%)
## | | | | total.sulfur.dioxide > 161.5 : LM27 (77/77.736%)
## alcohol > 10.85 :
## | alcohol <= 11.767 :
## | | free.sulfur.dioxide <= 21.5 :
## | | | free.sulfur.dioxide <= 11.5 :
## | | | | density <= 0.992 : LM28 (19/84.403%)
## | | | | density > 0.992 :
## | | | | | fixed.acidity <= 6.85 : LM29 (6/108.029%)
## | | | | | fixed.acidity > 6.85 : LM30 (21/69.935%)
## | | | free.sulfur.dioxide > 11.5 :
## | | | | volatile.acidity <= 0.195 : LM31 (36/61.98%)
## | | | | volatile.acidity > 0.195 :
## | | | | | chlorides <= 0.036 : LM32 (34/115.199%)
## | | | | | chlorides > 0.036 : LM33 (59/78.207%)
## | | free.sulfur.dioxide > 21.5 : LM34 (495/84.229%)
## | alcohol > 11.767 :
## | | free.sulfur.dioxide <= 21.5 : LM35 (181/88.599%)
## | | free.sulfur.dioxide > 21.5 : LM36 (527/81.837%)
##
## LM num: 1
## quality =
## 0.266 * fixed.acidity
## - 2.3082 * volatile.acidity
## - 0.012 * citric.acid
## + 0.0421 * residual.sugar
## + 0.1126 * chlorides
## + 0 * free.sulfur.dioxide
## - 0.0015 * total.sulfur.dioxide
## - 109.8813 * density
## + 0.035 * pH
## + 1.4122 * sulphates
## - 0.0046 * alcohol
## + 113.1021
##
## LM num: 2
## quality =
## -0.2557 * fixed.acidity
## - 0.8082 * volatile.acidity
## - 0.1062 * citric.acid
## + 0.0738 * residual.sugar
## + 0.0973 * chlorides
## + 0.0006 * free.sulfur.dioxide
## + 0.0003 * total.sulfur.dioxide
## - 210.1018 * density
## + 0.0323 * pH
## - 0.9604 * sulphates
## - 0.0231 * alcohol
## + 216.8857
##
## LM num: 3
## quality =
## 0.0725 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 0.6118 * citric.acid
## + 0.0294 * residual.sugar
## + 105.3735 * chlorides
## - 0.0027 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.0323 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 32.2345
##
## LM num: 4
## quality =
## 0.0725 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 0.6118 * citric.acid
## + 0.0294 * residual.sugar
## + 99.4295 * chlorides
## - 0.0027 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.0323 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 32.6786
##
## LM num: 5
## quality =
## 0.0944 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 0.6118 * citric.acid
## + 0.0255 * residual.sugar
## + 95.8527 * chlorides
## - 0.0027 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.0323 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 32.9544
##
## LM num: 6
## quality =
## 0.0012 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 0.6118 * citric.acid
## + 0.0491 * residual.sugar
## + 54.3184 * chlorides
## - 0.0027 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.0323 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 35.4429
##
## LM num: 7
## quality =
## 0.0012 * fixed.acidity
## - 2.7131 * volatile.acidity
## - 1.0049 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0147 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.633 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 36.9235
##
## LM num: 8
## quality =
## 0.0012 * fixed.acidity
## - 2.7131 * volatile.acidity
## - 1.0049 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0141 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.633 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 36.8808
##
## LM num: 9
## quality =
## 0.0012 * fixed.acidity
## - 3.4336 * volatile.acidity
## - 1.0049 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0146 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.633 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 37.0118
##
## LM num: 10
## quality =
## 0.0012 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 1.0049 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0065 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.8211 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 35.686
##
## LM num: 11
## quality =
## 0.0012 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 1.0049 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0065 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.2757 * pH
## + 0.1199 * sulphates
## - 0.0373 * alcohol
## + 37.5168
##
## LM num: 12
## quality =
## -0.0571 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 1.534 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0098 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.2583 * pH
## + 0.3345 * sulphates
## - 0.0373 * alcohol
## + 38.0548
##
## LM num: 13
## quality =
## -0.304 * fixed.acidity
## - 1.0921 * volatile.acidity
## + 0.3698 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0097 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.2583 * pH
## + 0.3345 * sulphates
## - 0.0373 * alcohol
## + 39.1208
##
## LM num: 14
## quality =
## -0.317 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 1.5116 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0097 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.2583 * pH
## + 0.3345 * sulphates
## - 0.0373 * alcohol
## + 39.9144
##
## LM num: 15
## quality =
## -0.0683 * fixed.acidity
## - 1.0921 * volatile.acidity
## - 1.3217 * citric.acid
## + 0.0297 * residual.sugar
## + 5.7935 * chlorides
## - 0.0088 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 31.5856 * density
## + 0.2583 * pH
## + 0.3758 * sulphates
## - 0.0373 * alcohol
## + 37.9875
##
## LM num: 16
## quality =
## -0.4138 * fixed.acidity
## - 2.4188 * volatile.acidity
## - 0.1001 * citric.acid
## + 0.0519 * residual.sugar
## + 1.2445 * chlorides
## + 0.0002 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## + 146.7811 * density
## + 0.5635 * pH
## + 0.3884 * sulphates
## + 0.7403 * alcohol
## - 145.8266
##
## LM num: 17
## quality =
## 0.2744 * fixed.acidity
## - 3.6766 * volatile.acidity
## - 0.1001 * citric.acid
## + 0.0846 * residual.sugar
## + 0.5477 * chlorides
## + 0.0002 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 239.7241 * density
## + 1.5648 * pH
## + 0.8289 * sulphates
## - 0.0207 * alcohol
## + 237.4198
##
## LM num: 18
## quality =
## 0.0178 * fixed.acidity
## - 1.19 * volatile.acidity
## - 0.1001 * citric.acid
## + 0.041 * residual.sugar
## + 0.0973 * chlorides
## + 0.0002 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 46.7151 * density
## + 0.1443 * pH
## + 0.1669 * sulphates
## - 0.0207 * alcohol
## + 51.8086
##
## LM num: 19
## quality =
## 0.0178 * fixed.acidity
## - 11.6553 * volatile.acidity
## - 0.1001 * citric.acid
## + 0.0199 * residual.sugar
## + 0.0973 * chlorides
## + 0.0002 * free.sulfur.dioxide
## + 0.0044 * total.sulfur.dioxide
## - 46.7151 * density
## + 2.2855 * pH
## + 0.1669 * sulphates
## - 0.0207 * alcohol
## + 46.4726
##
## LM num: 20
## quality =
## -0.0389 * fixed.acidity
## - 0.2704 * volatile.acidity
## + 0.6445 * citric.acid
## + 0.0043 * residual.sugar
## - 11.7525 * chlorides
## + 0.0148 * free.sulfur.dioxide
## + 13.1536 * density
## - 0.2235 * pH
## + 0.0154 * sulphates
## + 0.1335 * alcohol
## - 8.119
##
## LM num: 21
## quality =
## -0.0139 * fixed.acidity
## - 0.2704 * volatile.acidity
## + 2.7911 * citric.acid
## + 0.0043 * residual.sugar
## - 16.394 * chlorides
## - 0.0093 * free.sulfur.dioxide
## - 0.0028 * total.sulfur.dioxide
## + 2.2255 * density
## - 0.088 * pH
## + 0.0154 * sulphates
## + 0.285 * alcohol
## + 1.9775
##
## LM num: 22
## quality =
## 0.0008 * fixed.acidity
## - 3.3571 * volatile.acidity
## - 0.0474 * citric.acid
## + 0.0167 * residual.sugar
## + 0.0002 * free.sulfur.dioxide
## - 0.0001 * total.sulfur.dioxide
## - 2.6496 * density
## + 0.0071 * pH
## + 0.0154 * sulphates
## + 0.0295 * alcohol
## + 8.7127
##
## LM num: 23
## quality =
## 0.0008 * fixed.acidity
## - 0.1682 * volatile.acidity
## - 0.0533 * citric.acid
## + 0.0034 * residual.sugar
## + 0.0112 * free.sulfur.dioxide
## - 0.004 * total.sulfur.dioxide
## - 2.4685 * density
## + 0.0071 * pH
## + 0.0154 * sulphates
## + 0.3099 * alcohol
## + 5.1272
##
## LM num: 24
## quality =
## -0.1011 * fixed.acidity
## - 0.8767 * volatile.acidity
## + 0.0025 * citric.acid
## + 0.0183 * residual.sugar
## - 1.5815 * chlorides
## + 0 * free.sulfur.dioxide
## + 0.0015 * total.sulfur.dioxide
## - 4.1889 * density
## + 0.0195 * pH
## + 0.0154 * sulphates
## + 0.2656 * alcohol
## + 7.556
##
## LM num: 25
## quality =
## 0.1885 * fixed.acidity
## - 1.6681 * volatile.acidity
## + 0.0075 * citric.acid
## + 0.1434 * residual.sugar
## + 0.0181 * free.sulfur.dioxide
## - 438.9263 * density
## + 1.5263 * pH
## + 1.5041 * sulphates
## + 0.0067 * alcohol
## + 434.1083
##
## LM num: 26
## quality =
## 0.3156 * fixed.acidity
## - 0.3103 * volatile.acidity
## + 0.0075 * citric.acid
## + 0.0769 * residual.sugar
## + 0.0157 * free.sulfur.dioxide
## - 0.0006 * total.sulfur.dioxide
## - 224.3886 * density
## + 2.8971 * pH
## + 1.4123 * sulphates
## + 0.0067 * alcohol
## + 215.8849
##
## LM num: 27
## quality =
## 0.0704 * fixed.acidity
## - 1.6931 * volatile.acidity
## + 0.0075 * citric.acid
## + 0.0268 * residual.sugar
## + 0 * free.sulfur.dioxide
## - 0.0058 * total.sulfur.dioxide
## - 69.0546 * density
## + 0.5221 * pH
## + 0.3033 * sulphates
## + 0.0067 * alcohol
## + 73.2245
##
## LM num: 28
## quality =
## -0.0359 * fixed.acidity
## - 2.1355 * volatile.acidity
## + 0.0312 * residual.sugar
## - 0.7007 * chlorides
## + 0.0139 * free.sulfur.dioxide
## - 3.9257 * density
## + 0.1002 * pH
## + 0.0883 * sulphates
## + 0.0057 * alcohol
## + 9.0802
##
## LM num: 29
## quality =
## -0.1622 * fixed.acidity
## - 1.936 * volatile.acidity
## + 0.0312 * residual.sugar
## - 0.7007 * chlorides
## + 0.0139 * free.sulfur.dioxide
## - 8.2054 * density
## + 0.5998 * pH
## + 0.0883 * sulphates
## + 0.0057 * alcohol
## + 13.1705
##
## LM num: 30
## quality =
## -0.1095 * fixed.acidity
## - 1.936 * volatile.acidity
## + 0.0312 * residual.sugar
## - 0.7007 * chlorides
## + 0.0139 * free.sulfur.dioxide
## - 8.2054 * density
## + 0.8708 * pH
## + 0.0883 * sulphates
## + 0.0057 * alcohol
## + 11.7475
##
## LM num: 31
## quality =
## -0.2583 * fixed.acidity
## - 1.4215 * volatile.acidity
## - 1.371 * citric.acid
## + 0.0305 * residual.sugar
## - 3.2137 * chlorides
## + 0.0063 * free.sulfur.dioxide
## - 18.7292 * density
## + 0.1002 * pH
## + 0.0883 * sulphates
## + 0.1232 * alcohol
## + 25.7445
##
## LM num: 32
## quality =
## -0.0968 * fixed.acidity
## - 0.9855 * volatile.acidity
## + 0.0245 * residual.sugar
## - 4.6936 * chlorides
## + 0.0063 * free.sulfur.dioxide
## - 18.7292 * density
## - 0.2017 * pH
## + 0.0883 * sulphates
## + 0.0612 * alcohol
## + 25.5306
##
## LM num: 33
## quality =
## -0.0764 * fixed.acidity
## - 0.9855 * volatile.acidity
## + 0.0461 * residual.sugar
## - 3.7456 * chlorides
## + 0.0063 * free.sulfur.dioxide
## - 18.7292 * density
## - 0.0997 * pH
## + 0.0883 * sulphates
## + 0.4563 * alcohol
## + 20.1476
##
## LM num: 34
## quality =
## 0.0026 * fixed.acidity
## - 1.5467 * volatile.acidity
## + 0.5902 * citric.acid
## + 0.0796 * residual.sugar
## - 7.6293 * chlorides
## + 0.0004 * free.sulfur.dioxide
## - 0.002 * total.sulfur.dioxide
## - 105.9188 * density
## + 0.9409 * pH
## + 1.1632 * sulphates
## + 0.0057 * alcohol
## + 108.0478
##
## LM num: 35
## quality =
## 0.1974 * fixed.acidity
## - 1.5244 * volatile.acidity
## - 1.1342 * citric.acid
## + 0.1108 * residual.sugar
## - 0.5309 * chlorides
## + 0.0345 * free.sulfur.dioxide
## + 0.0002 * total.sulfur.dioxide
## - 306.9205 * density
## + 1.162 * pH
## + 0.0755 * sulphates
## - 0.0054 * alcohol
## + 305.176
##
## LM num: 36
## quality =
## 0.2738 * fixed.acidity
## - 0.0442 * volatile.acidity
## + 0.1664 * residual.sugar
## - 7.6486 * chlorides
## + 0.0005 * free.sulfur.dioxide
## + 0.0001 * total.sulfur.dioxide
## - 350.199 * density
## + 1.7781 * pH
## + 1.0583 * sulphates
## - 0.1722 * alcohol
## + 347.3058
##
## Number of Rules : 36
# get a summary of the model's performance
# NOTE: the reported instance count (3750) equals the training-set size, so
# these statistics appear to describe the fit on the training data, not the
# performance on wine_test
summary(m.m5p)
##
## === Summary ===
##
## Correlation coefficient 0.6666
## Mean absolute error 0.5151
## Root mean squared error 0.6614
## Relative absolute error 76.4921 %
## Root relative squared error 74.6259 %
## Total Number of Instances 3750
# predict quality for the test set with the model tree
p.m5p <- predict(m.m5p, newdata = wine_test)
# distribution of the predicted values
summary(p.m5p)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.389 5.430 5.863 5.874 6.305 7.437
# correlation between the predictions and the actual quality ratings
cor(x = p.m5p, y = wine_test$quality)
## [1] 0.6272973
# mean absolute error of the predictions against the actual ratings
# (MAE is the custom function defined earlier in this script)
MAE(actual = wine_test$quality, predicted = p.m5p)
## [1] 0.5463023
# The correlation also seems to be substantially higher, and the model slightly improved the mean absolute error.