This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
# Load the red-wine quality data set from the working directory.
wine <- read.csv(file = "winequality-red.csv")
# Inspect the structure: number of rows, and each column's name and type.
str(wine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Plot the distribution of the quality scores (the outcome to be modelled).
hist(wine$quality)
# Min / quartiles / median / mean / max for every column of the data frame.
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# Hold-out split: the first 1300 rows train the models and every remaining row
# is used for evaluation.
# NOTE(review): the split is positional, not random, so it is only a fair
# sample if the rows of the CSV are in no particular order -- confirm.
wine_train <- wine[1:1300, ]
# Negative indexing selects "everything that is not a training row", so the
# test set no longer depends on a hard-coded total row count (previously
# 1301:1599, which would produce NA rows or drop data if the file changed).
wine_test <- wine[-(1:1300), ]
library(rpart)
# Fit a tree that predicts quality from all other columns of the training set.
m.rpart <- rpart(formula = quality ~ ., data = wine_train)
# Show the fitted tree: one line per node with its split, n, deviance and yval.
print(m.rpart)
## n= 1300
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1300 859.72000 5.660000
## 2) alcohol< 10.55 813 336.49690 5.384994
## 4) sulphates< 0.575 307 94.52769 5.166124 *
## 5) sulphates>=0.575 506 218.33990 5.517787
## 10) volatile.acidity>=0.405 377 142.90720 5.405836
## 20) total.sulfur.dioxide>=46.5 176 50.35795 5.221591 *
## 21) total.sulfur.dioxide< 46.5 201 81.34328 5.567164 *
## 11) volatile.acidity< 0.405 129 56.89922 5.844961 *
## 3) alcohol>=10.55 487 359.09240 6.119097
## 6) volatile.acidity>=0.87 21 16.95238 4.619048 *
## 7) volatile.acidity< 0.87 466 292.75750 6.186695
## 14) sulphates< 0.625 173 102.78610 5.843931
## 28) volatile.acidity>=0.385 119 66.55462 5.663866
## 56) free.sulfur.dioxide< 8.5 43 31.06977 5.302326 *
## 57) free.sulfur.dioxide>=8.5 76 26.68421 5.868421 *
## 29) volatile.acidity< 0.385 54 23.87037 6.240741 *
## 15) sulphates>=0.625 293 157.64510 6.389078
## 30) alcohol< 11.55 161 75.11801 6.155280 *
## 31) alcohol>=11.55 132 62.99242 6.674242 *
# Detailed report of the fitted tree: the complexity-parameter (CP) table,
# variable importance, and the primary and surrogate splits at each node.
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 1300
##
## CP nsplit rel error xerror xstd
## 1 0.19091178 0 1.0000000 1.0014977 0.04024323
## 2 0.05744023 1 0.8090882 0.8583681 0.04074097
## 3 0.03760100 2 0.7516480 0.8050408 0.03593837
## 4 0.02748490 3 0.7140470 0.7824729 0.03465799
## 5 0.02272207 4 0.6865621 0.7332420 0.03214081
## 6 0.02155764 5 0.6638400 0.7217821 0.03169172
## 7 0.01437809 6 0.6422824 0.7080211 0.03132872
## 8 0.01303439 7 0.6279043 0.7058365 0.03098298
## 9 0.01023664 8 0.6148699 0.6899567 0.03077058
## 10 0.01000000 9 0.6046333 0.6837573 0.03058254
##
## Variable importance
## alcohol volatile.acidity density
## 33 19 14
## sulphates total.sulfur.dioxide chlorides
## 10 6 6
## fixed.acidity free.sulfur.dioxide citric.acid
## 5 3 3
## pH residual.sugar
## 1 1
##
## Node number 1: 1300 observations, complexity param=0.1909118
## mean=5.66, MSE=0.6613231
## left son=2 (813 obs) right son=3 (487 obs)
## Primary splits:
## alcohol < 10.55 to the left, improve=0.19091180, (0 missing)
## volatile.acidity < 0.425 to the right, improve=0.11695820, (0 missing)
## sulphates < 0.645 to the left, improve=0.11405640, (0 missing)
## density < 0.995565 to the right, improve=0.07884340, (0 missing)
## citric.acid < 0.305 to the left, improve=0.07230016, (0 missing)
## Surrogate splits:
## density < 0.995575 to the right, agree=0.775, adj=0.400, (0 split)
## chlorides < 0.0685 to the right, agree=0.688, adj=0.166, (0 split)
## volatile.acidity < 0.3675 to the right, agree=0.671, adj=0.121, (0 split)
## total.sulfur.dioxide < 12.5 to the right, agree=0.658, adj=0.088, (0 split)
## fixed.acidity < 6.55 to the right, agree=0.653, adj=0.074, (0 split)
##
## Node number 2: 813 observations, complexity param=0.0274849
## mean=5.384994, MSE=0.4138954
## left son=4 (307 obs) right son=5 (506 obs)
## Primary splits:
## sulphates < 0.575 to the left, improve=0.07022149, (0 missing)
## volatile.acidity < 0.325 to the right, improve=0.06725766, (0 missing)
## alcohol < 9.85 to the left, improve=0.06644140, (0 missing)
## total.sulfur.dioxide < 83.5 to the right, improve=0.03974084, (0 missing)
## fixed.acidity < 12.55 to the left, improve=0.03453744, (0 missing)
## Surrogate splits:
## density < 0.996285 to the left, agree=0.669, adj=0.124, (0 split)
## volatile.acidity < 0.7975 to the right, agree=0.648, adj=0.068, (0 split)
## citric.acid < 0.105 to the left, agree=0.635, adj=0.033, (0 split)
## fixed.acidity < 6.15 to the left, agree=0.632, adj=0.026, (0 split)
## chlorides < 0.055 to the left, agree=0.629, adj=0.016, (0 split)
##
## Node number 3: 487 observations, complexity param=0.05744023
## mean=6.119097, MSE=0.7373561
## left son=6 (21 obs) right son=7 (466 obs)
## Primary splits:
## volatile.acidity < 0.87 to the right, improve=0.13752030, (0 missing)
## sulphates < 0.625 to the left, improve=0.13402640, (0 missing)
## alcohol < 11.55 to the left, improve=0.11749380, (0 missing)
## citric.acid < 0.295 to the left, improve=0.11086060, (0 missing)
## pH < 3.355 to the right, improve=0.06231254, (0 missing)
##
## Node number 4: 307 observations
## mean=5.166124, MSE=0.3079078
##
## Node number 5: 506 observations, complexity param=0.02155764
## mean=5.517787, MSE=0.4315018
## left son=10 (377 obs) right son=11 (129 obs)
## Primary splits:
## volatile.acidity < 0.405 to the right, improve=0.08488386, (0 missing)
## total.sulfur.dioxide < 82.5 to the right, improve=0.07334348, (0 missing)
## fixed.acidity < 10.95 to the left, improve=0.06416595, (0 missing)
## alcohol < 9.85 to the left, improve=0.06023478, (0 missing)
## chlorides < 0.0975 to the right, improve=0.03533756, (0 missing)
## Surrogate splits:
## fixed.acidity < 10.45 to the left, agree=0.783, adj=0.147, (0 split)
## citric.acid < 0.365 to the left, agree=0.757, adj=0.047, (0 split)
## chlorides < 0.0595 to the right, agree=0.757, adj=0.047, (0 split)
## free.sulfur.dioxide < 2.5 to the right, agree=0.753, adj=0.031, (0 split)
##
## Node number 6: 21 observations
## mean=4.619048, MSE=0.8072562
##
## Node number 7: 466 observations, complexity param=0.037601
## mean=6.186695, MSE=0.628235
## left son=14 (173 obs) right son=15 (293 obs)
## Primary splits:
## sulphates < 0.625 to the left, improve=0.11042020, (0 missing)
## alcohol < 11.55 to the left, improve=0.09550440, (0 missing)
## volatile.acidity < 0.445 to the right, improve=0.08224086, (0 missing)
## citric.acid < 0.335 to the left, improve=0.07518739, (0 missing)
## fixed.acidity < 7.85 to the left, improve=0.03543857, (0 missing)
## Surrogate splits:
## citric.acid < 0.205 to the left, agree=0.704, adj=0.202, (0 split)
## fixed.acidity < 7.15 to the left, agree=0.697, adj=0.185, (0 split)
## volatile.acidity < 0.665 to the right, agree=0.676, adj=0.127, (0 split)
## total.sulfur.dioxide < 12.5 to the left, agree=0.670, adj=0.110, (0 split)
## density < 0.99351 to the left, agree=0.663, adj=0.092, (0 split)
##
## Node number 10: 377 observations, complexity param=0.01303439
## mean=5.405836, MSE=0.3790641
## left son=20 (176 obs) right son=21 (201 obs)
## Primary splits:
## total.sulfur.dioxide < 46.5 to the right, improve=0.07841401, (0 missing)
## alcohol < 9.85 to the left, improve=0.05783720, (0 missing)
## free.sulfur.dioxide < 26.5 to the right, improve=0.03637224, (0 missing)
## fixed.acidity < 11 to the left, improve=0.03277869, (0 missing)
## chlorides < 0.0975 to the right, improve=0.02813017, (0 missing)
## Surrogate splits:
## free.sulfur.dioxide < 14.5 to the right, agree=0.804, adj=0.580, (0 split)
## residual.sugar < 2.55 to the right, agree=0.637, adj=0.222, (0 split)
## citric.acid < 0.145 to the right, agree=0.602, adj=0.148, (0 split)
## chlorides < 0.0975 to the right, agree=0.602, adj=0.148, (0 split)
## pH < 3.235 to the left, agree=0.602, adj=0.148, (0 split)
##
## Node number 11: 129 observations
## mean=5.844961, MSE=0.4410793
##
## Node number 14: 173 observations, complexity param=0.01437809
## mean=5.843931, MSE=0.5941395
## left son=28 (119 obs) right son=29 (54 obs)
## Primary splits:
## volatile.acidity < 0.385 to the right, improve=0.12026070, (0 missing)
## citric.acid < 0.255 to the left, improve=0.09756330, (0 missing)
## alcohol < 11.05 to the left, improve=0.09282229, (0 missing)
## pH < 3.265 to the right, improve=0.07266816, (0 missing)
## density < 0.995565 to the right, improve=0.06217593, (0 missing)
## Surrogate splits:
## citric.acid < 0.255 to the left, agree=0.803, adj=0.370, (0 split)
## pH < 3.275 to the right, agree=0.763, adj=0.241, (0 split)
## density < 0.99156 to the right, agree=0.723, adj=0.111, (0 split)
## fixed.acidity < 7.85 to the left, agree=0.717, adj=0.093, (0 split)
## free.sulfur.dioxide < 35 to the left, agree=0.717, adj=0.093, (0 split)
##
## Node number 15: 293 observations, complexity param=0.02272207
## mean=6.389078, MSE=0.5380377
## left son=30 (161 obs) right son=31 (132 obs)
## Primary splits:
## alcohol < 11.55 to the left, improve=0.12391520, (0 missing)
## total.sulfur.dioxide < 96 to the right, improve=0.06564923, (0 missing)
## volatile.acidity < 0.335 to the right, improve=0.05428798, (0 missing)
## chlorides < 0.0785 to the right, improve=0.04876986, (0 missing)
## density < 0.99359 to the right, improve=0.04346662, (0 missing)
## Surrogate splits:
## density < 0.995315 to the right, agree=0.700, adj=0.333, (0 split)
## fixed.acidity < 5.75 to the right, agree=0.611, adj=0.136, (0 split)
## chlorides < 0.0565 to the right, agree=0.611, adj=0.136, (0 split)
## pH < 3.475 to the left, agree=0.601, adj=0.114, (0 split)
## residual.sugar < 4.25 to the left, agree=0.594, adj=0.098, (0 split)
##
## Node number 20: 176 observations
## mean=5.221591, MSE=0.2861247
##
## Node number 21: 201 observations
## mean=5.567164, MSE=0.404693
##
## Node number 28: 119 observations, complexity param=0.01023664
## mean=5.663866, MSE=0.5592825
## left son=56 (43 obs) right son=57 (76 obs)
## Primary splits:
## free.sulfur.dioxide < 8.5 to the left, improve=0.13223190, (0 missing)
## total.sulfur.dioxide < 13.5 to the left, improve=0.06204122, (0 missing)
## sulphates < 0.585 to the left, improve=0.05548373, (0 missing)
## alcohol < 11.45 to the left, improve=0.05304082, (0 missing)
## volatile.acidity < 0.595 to the right, improve=0.04312165, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 15.5 to the left, agree=0.899, adj=0.721, (0 split)
## fixed.acidity < 7.9 to the right, agree=0.681, adj=0.116, (0 split)
## density < 0.99665 to the right, agree=0.672, adj=0.093, (0 split)
## citric.acid < 0.285 to the right, agree=0.664, adj=0.070, (0 split)
## sulphates < 0.395 to the left, agree=0.664, adj=0.070, (0 split)
##
## Node number 29: 54 observations
## mean=6.240741, MSE=0.4420439
##
## Node number 30: 161 observations
## mean=6.15528, MSE=0.4665715
##
## Node number 31: 132 observations
## mean=6.674242, MSE=0.4772153
##
## Node number 56: 43 observations
## mean=5.302326, MSE=0.7225527
##
## Node number 57: 76 observations
## mean=5.868421, MSE=0.351108
#install.packages("rpart.plot")
library(rpart.plot)
# Basic diagram of the fitted tree, with numbers shown to 3 significant digits.
rpart.plot(m.rpart, digits = 3)
# Second rendering of the same tree with different layout options
# (type 3, extra = 101, leaves aligned at the bottom, 4 digits).
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  fallen.leaves = TRUE,
  digits = 4
)
# Predict quality for the held-out wines using the fitted tree.
p.rpart <- predict(object = m.rpart, newdata = wine_test)
# Range of the predictions ...
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.619 5.166 5.567 5.631 6.155 6.674
# ... versus the range of the true quality scores in the test set.
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.532 6.000 8.000
# Correlation between predicted and actual quality.
cor(x = p.rpart, y = wine_test$quality)
## [1] 0.480814
# Mean absolute error: the average magnitude of the element-wise differences
# between `actual` and `predicted`.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predicted values (a single value is recycled
#            across `actual` by R's usual arithmetic rules).
# Returns a single numeric value; NA if any difference is NA.
MAE <- function(actual, predicted) {
  abs_errors <- abs(actual - predicted)
  mean(abs_errors)
}
# Mean absolute error of the tree's predictions on the test set. Arguments are
# given in the order MAE() declares them (actual, predicted); the value is the
# same either way because the absolute difference is symmetric.
MAE(wine_test$quality, p.rpart)
## [1] 0.5178863
# Baseline: mean absolute error when every test wine is predicted to have the
# mean quality of the training set.
mean(wine_train$quality)
## [1] 5.66
# Use the computed training mean rather than a hard-coded constant, so the
# baseline always matches the data actually loaded (the earlier literal 5.87
# did not match the 5.66 printed above).
MAE(wine_test$quality, mean(wine_train$quality))
## (stale: the recorded value 0.6492642 was computed with the hard-coded 5.87; re-run this chunk to refresh)
#install.packages("plyr")
#install.packages("Cubist")
library(Cubist)
## Loading required package: lattice
# Fit a Cubist model. Predictors are every column except the 12th (quality);
# the outcome is the quality column itself.
m.cubist <- cubist(x = wine_train[-12], y = wine_train$quality)
# Print the model overview (sample/predictor counts, committees, rules).
m.cubist
##
## Call:
## cubist.default(x = wine_train[-12], y = wine_train$quality)
##
## Number of samples: 1300
## Number of predictors: 11
##
## Number of committees: 1
## Number of rules: 11
# Full Cubist report: each rule's conditions and linear model, the error
# summary on the training data, and per-attribute usage percentages.
summary(m.cubist)
##
## Call:
## cubist.default(x = wine_train[-12], y = wine_train$quality)
##
##
## Cubist [Release 2.07 GPL Edition] Mon Feb 9 23:09:48 2026
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 1300 cases (12 attributes) from undefined.data
##
## Model:
##
## Rule 1: [98 cases, mean 5.2, range 3 to 7, est err 0.4]
##
## if
## volatile.acidity > 0.31
## pH <= 3.16
## sulphates <= 0.63
## then
## outcome = 26.4 + 0.361 alcohol + 1.04 sulphates - 0.45 pH
## + 0.0062 free.sulfur.dioxide - 0.0019 total.sulfur.dioxide
## - 0.29 volatile.acidity - 24 density
##
## Rule 2: [544 cases, mean 5.3, range 3 to 8, est err 0.4]
##
## if
## alcohol <= 9.8
## then
## outcome = 5.2 - 0.94 volatile.acidity - 0.0032 total.sulfur.dioxide
## + 0.096 alcohol + 0.26 sulphates - 0.6 chlorides - 0.12 pH
## + 0.0015 free.sulfur.dioxide
##
## Rule 3: [170 cases, mean 5.4, range 3 to 7, est err 0.6]
##
## if
## free.sulfur.dioxide <= 7
## pH > 3.16
## sulphates <= 0.63
## then
## outcome = 4.3 + 0.313 alcohol - 1.37 volatile.acidity + 0.93 sulphates
## - 0.115 residual.sugar - 0.5 pH
##
## Rule 4: [45 cases, mean 5.6, range 3 to 7, est err 0.7]
##
## if
## volatile.acidity > 0.31
## chlorides > 0.092
## total.sulfur.dioxide > 32
## sulphates > 0.63
## alcohol > 9.8
## then
## outcome = 6.3 - 8 chlorides - 0.0049 total.sulfur.dioxide
## - 0.88 volatile.acidity + 0.133 alcohol + 0.57 sulphates
## + 0.0036 free.sulfur.dioxide - 0.24 pH - 0.012 fixed.acidity
##
## Rule 5: [174 cases, mean 5.7, range 4 to 7, est err 0.5]
##
## if
## volatile.acidity > 0.31
## free.sulfur.dioxide > 7
## pH > 3.16
## sulphates <= 0.63
## alcohol > 9.8
## then
## outcome = 14 + 2.95 sulphates + 0.209 alcohol - 1.47 pH
## - 0.0058 total.sulfur.dioxide + 0.0018 free.sulfur.dioxide
## - 0.09 volatile.acidity - 7 density
##
## Rule 6: [137 cases, mean 6.1, range 5 to 8, est err 0.5]
##
## if
## volatile.acidity > 0.31
## total.sulfur.dioxide <= 32
## pH > 3.17
## sulphates > 0.63
## then
## outcome = 82.5 + 0.259 alcohol + 0.177 residual.sugar - 80 density
## + 0.45 citric.acid
##
## Rule 7: [126 cases, mean 6.1, range 5 to 8, est err 0.4]
##
## if
## volatile.acidity > 0.31
## residual.sugar > 1.8
## chlorides <= 0.092
## total.sulfur.dioxide > 32
## sulphates > 0.63
## alcohol > 9.8
## then
## outcome = 3.1 - 0.017 total.sulfur.dioxide + 0.339 alcohol
## + 0.074 residual.sugar + 0.06 sulphates
##
## Rule 8: [43 cases, mean 6.1, range 3 to 8, est err 0.5]
##
## if
## residual.sugar <= 1.8
## total.sulfur.dioxide > 32
## sulphates > 0.63
## alcohol > 9.8
## then
## outcome = 12.5 + 0.0243 total.sulfur.dioxide - 3.06 pH + 0.242 alcohol
## - 0.5 chlorides + 0.06 sulphates
##
## Rule 9: [35 cases, mean 6.3, range 5 to 8, est err 0.5]
##
## if
## volatile.acidity > 0.31
## total.sulfur.dioxide <= 32
## pH <= 3.17
## sulphates > 0.63
## alcohol > 9.8
## then
## outcome = 162.1 + 3.85 pH + 10.5 chlorides - 169 density
## + 0.032 residual.sugar + 0.16 citric.acid
##
## Rule 10: [58 cases, mean 6.3, range 5 to 8, est err 0.5]
##
## if
## volatile.acidity <= 0.31
## sulphates <= 0.73
## alcohol > 9.8
## then
## outcome = 504.3 + 5.71 volatile.acidity - 503 density
## + 0.313 fixed.acidity + 2.77 sulphates - 0.33 alcohol
## + 0.214 residual.sugar - 0.0047 total.sulfur.dioxide
##
## Rule 11: [75 cases, mean 6.5, range 5 to 8, est err 0.4]
##
## if
## volatile.acidity <= 0.31
## sulphates > 0.73
## then
## outcome = 9.2 + 0.404 alcohol + 1.53 citric.acid + 0.22 volatile.acidity
## + 0.11 sulphates - 8 density + 0.008 fixed.acidity
##
##
## Evaluation on training data (1300 cases):
##
## Average |error| 0.4
## Relative |error| 0.58
## Correlation coefficient 0.66
##
##
## Attribute usage:
## Conds Model
##
## 68% 98% alcohol
## 64% 89% sulphates
## 50% 77% volatile.acidity
## 41% 74% pH
## 26% 72% total.sulfur.dioxide
## 23% 57% free.sulfur.dioxide
## 11% 44% chlorides
## 11% 35% residual.sugar
## 38% density
## 16% citric.acid
## 12% fixed.acidity
##
##
## Time: 0.0 secs
# Predict quality for the held-out wines using the Cubist model.
p.cubist <- predict(object = m.cubist, newdata = wine_test)
# Range of the Cubist predictions.
summary(p.cubist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.422 5.164 5.584 5.615 5.974 7.040
# Correlation between predicted and actual quality.
cor(x = p.cubist, y = wine_test$quality)
## [1] 0.5387365
# Mean absolute error of the Cubist predictions on the test set.
MAE(actual = wine_test$quality, predicted = p.cubist)
## [1] 0.4995739
#**************************** #WHITE WINE #***************************
# The white-wine file is semicolon-delimited, so the separator must be given
# explicitly. With read.csv()'s default sep = "," each row was read into a
# single character column (4898 obs. of 1 variable), which is why
# hist(whitewine$quality) failed with "'x' must be numeric".
whitewine <- read.csv("winequality-white.csv", sep = ";")
# Confirm the columns are now parsed separately.
str(whitewine)
## (output not re-recorded after the separator fix; re-run this chunk to refresh)
# Distribution of the white-wine quality scores.
hist(whitewine$quality)