# Collecting, exploring, and preparing the data
# Load the white-wine data set from a local CSV file and print its structure.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this script on another machine.
wine <- read.csv(
  file = "C:/Users/Justice2/Desktop/Machine Learning & Data Science/R/data/whitewines.csv"
)
str(object = wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
# Distribution of the quality ratings.
hist(wine$quality, col = "brown")

# Positional train/test split: the first `n_train` rows are used for training
# and every remaining row is held out for testing. The split is by row index,
# not by random sampling, so it assumes the rows are already in random order
# (TODO confirm against the data source).
n_train <- 3750
# Guard against a shorter file: indexing past the last row would otherwise
# silently create rows filled with NA.
stopifnot(nrow(wine) > n_train)
wine_train <- wine[seq_len(n_train), ]
wine_test <- wine[(n_train + 1):nrow(wine), ]
# Training a model on the data
library(rpart)

# Fit a tree that models `quality` from every other column of the training
# rows, then print the fitted splits.
m.rpart <- rpart(formula = quality ~ ., data = wine_train)
print(m.rpart)
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 3140.06000 5.886933
## 2) alcohol< 10.85 2473 1510.66200 5.609381
## 4) volatile.acidity>=0.2425 1406 740.15080 5.402560
## 8) volatile.acidity>=0.4225 182 92.99451 4.994505 *
## 9) volatile.acidity< 0.4225 1224 612.34560 5.463235 *
## 5) volatile.acidity< 0.2425 1067 631.12090 5.881912 *
## 3) alcohol>=10.85 1277 1069.95800 6.424432
## 6) free.sulfur.dioxide< 11.5 93 99.18280 5.473118 *
## 7) free.sulfur.dioxide>=11.5 1184 879.99920 6.499155
## 14) alcohol< 11.85 611 447.38130 6.296236 *
## 15) alcohol>=11.85 573 380.63180 6.715532 *
# Visualizing decision trees
library(rpart.plot)

# First rendering: default layout, node values shown with 3 digits.
rpart.plot(m.rpart, digits = 3)

# Second rendering: 4 digits plus non-default layout options
# (see ?rpart.plot for the meaning of the `type` and `extra` codes).
rpart.plot(
  m.rpart,
  digits = 4,
  fallen.leaves = TRUE,
  type = 3,
  extra = 101
)
# Evaluating model performance
# Predict quality for the held-out wines with the fitted tree.
p.rpart <- predict(m.rpart, newdata = wine_test)

# Range of the predictions ...
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.995 5.463 5.882 5.999 6.296 6.716
# ... compared with the range of the actual ratings.
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.848 6.000 8.000
# Correlation between predicted and actual quality.
cor(x = p.rpart, y = wine_test$quality)
## [1] 0.4931608
# Measuring performance with the mean absolute error
#' Mean absolute error between observed and predicted values
#'
#' @param actual Numeric vector of observed values, or a single value.
#' @param predicted Numeric vector of predictions with the same length as
#'   `actual`, or a single value that is compared against every element.
#' @return A single number: the mean of `abs(actual - predicted)`.
MAE <- function(actual, predicted) {
  n_actual <- length(actual)
  n_predicted <- length(predicted)
  # A scalar on either side is allowed (constant baseline). Any other length
  # mismatch is rejected, because R would silently recycle the shorter vector
  # and return a meaningless error value.
  if (n_actual != n_predicted && n_actual != 1L && n_predicted != 1L) {
    stop(
      "`actual` and `predicted` must have the same length (or length 1).",
      call. = FALSE
    )
  }
  mean(abs(actual - predicted))
}
# Error of the regression tree on the held-out wines. Arguments are passed in
# the order of the MAE(actual, predicted) signature; the result is unchanged
# because the absolute difference is symmetric.
MAE(wine_test$quality, p.rpart)
## [1] 0.5732104
# Baseline for comparison: predict the training-set mean for every test wine.
mean(wine_train$quality)
## [1] 5.886933
# Use the computed mean rather than a hand-typed constant so the baseline
# always matches the training data.
MAE(wine_test$quality, mean(wine_train$quality))
## NOTE: re-run to refresh this output. The value recorded earlier,
## [1] 0.5815679, was computed with the hand-typed constant 5.87 instead of
## the training mean shown above.
# Improving model performance
library(RWeka)

# Fit an M5P model on the same training rows and formula as the rpart tree,
# then print the resulting tree and its per-leaf linear models.
m.m5p <- M5P(formula = quality ~ ., data = wine_train)
print(m.m5p)
## M5 pruned model tree:
## (using smoothed linear models)
##
## alcohol <= 10.85 : LM1 (2473/77.476%)
## alcohol > 10.85 :
## | free.sulfur.dioxide <= 20.5 :
## | | free.sulfur.dioxide <= 10.5 : LM2 (81/104.574%)
## | | free.sulfur.dioxide > 10.5 : LM3 (224/87.002%)
## | free.sulfur.dioxide > 20.5 : LM4 (972/84.073%)
##
## LM num: 1
## quality =
## 0.0777 * fixed.acidity
## - 2.3087 * volatile.acidity
## + 0.0732 * residual.sugar
## + 0.0022 * free.sulfur.dioxide
## - 155.0175 * density
## + 0.6462 * pH
## + 0.7923 * sulphates
## + 0.0758 * alcohol
## + 156.2102
##
## LM num: 2
## quality =
## -0.0314 * fixed.acidity
## - 0.3415 * volatile.acidity
## + 1.7929 * citric.acid
## + 0.1316 * residual.sugar
## - 0.2456 * chlorides
## + 0.1212 * free.sulfur.dioxide
## - 178.6281 * density
## + 0.054 * pH
## + 0.1392 * sulphates
## + 0.0108 * alcohol
## + 180.6069
##
## LM num: 3
## quality =
## -0.2019 * fixed.acidity
## - 2.3804 * volatile.acidity
## - 1.0851 * citric.acid
## + 0.0905 * residual.sugar
## - 0.2456 * chlorides
## + 0.0041 * free.sulfur.dioxide
## - 177.078 * density
## + 0.054 * pH
## + 0.0868 * sulphates
## + 0.0108 * alcohol
## + 183.5076
##
## LM num: 4
## quality =
## 0.0004 * fixed.acidity
## - 0.0325 * volatile.acidity
## + 0.0957 * residual.sugar
## - 5.9702 * chlorides
## + 0.0002 * free.sulfur.dioxide
## - 172.3931 * density
## + 1.0123 * pH
## + 1.1653 * sulphates
## + 0.1542 * alcohol
## + 171.6842
##
## Number of Rules : 4
# Evaluation statistics for the fitted M5P model (3750 instances, i.e. the
# training rows).
summary(m.m5p)
##
## === Summary ===
##
## Correlation coefficient 0.5932
## Mean absolute error 0.5804
## Root mean squared error 0.7367
## Relative absolute error 83.3671 %
## Root relative squared error 80.507 %
## Total Number of Instances 3750
# Predict quality for the held-out wines with the M5P model.
p.m5p <- predict(m.m5p, newdata = wine_test)
summary(p.m5p)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.170 5.646 6.032 6.079 6.501 7.913
# Correlation and mean absolute error against the actual ratings.
cor(x = p.m5p, y = wine_test$quality)
## [1] 0.531723
MAE(actual = wine_test$quality, predicted = p.m5p)
## [1] 0.5660352