Creating plotting functions
# Scatter plot of observed vs. predicted quality on the test set,
# with a linear trend line for reference
point_plot <- function(pred_qual){
  ggplot(wine_test, aes(quality, pred_qual)) +
    geom_point() +
    xlab('Observed Quality') +
    ylab('Predicted Quality') +
    stat_smooth(method = lm) +
    xlim(3, 8) +
    ylim(3.5, 7.5)
}
# Box plots of predicted quality, grouped by each observed quality score
box_plot <- function(pred_qual){
  ggplot(wine_test, aes(quality, pred_qual)) +
    geom_boxplot(aes(group = cut_width(quality, 1))) +
    xlab('Observed Quality') +
    ylab('Predicted Quality') +
    xlim(3, 8) +
    ylim(3.5, 7.5)
}
Comparing machine learning methods using the Caret R Package for a red wine quality model
library(readr)
library(tidyverse)
library(caret)

# Red wine quality data (semicolon-delimited winequality-red.csv)
wine <- read_delim('winequality-red.csv',
                   delim = ';',
                   escape_double = FALSE,
                   trim_ws = TRUE)
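Note that the column names in this file contain spaces (fixed acidity, volatile acidity, ...), which is why they appear wrapped in backticks in the model summaries below. A minimal, optional sketch using base R's make.names(), not applied in the rest of this post, if you prefer syntactic names:

# Optional: convert names with spaces to syntactic names,
# e.g. `fixed acidity` becomes fixed.acidity (skipped in this post)
names(wine) <- make.names(names(wine))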
ggplot(wine, aes(quality)) +
  geom_histogram(bins = 30)
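Both createDataPartition and repeated cross-validation draw random samples, so each run of this post gives slightly different numbers. A minimal sketch, assuming you want a reproducible split: call set.seed() once before the partitioning below (and again before each train() call).

# Make the train/test split and the resampling folds reproducible;
# the seed value itself is arbitrary
set.seed(42)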

# Use ~62.5% of rows for training and hold out the rest as a test set
rand_rows <- createDataPartition(wine$quality, p = 0.625, list = FALSE)
wine_train <- wine[rand_rows, ]
wine_test <- wine[-rand_rows, ]
Running a Generalized Linear Model
m_glm <- train(quality ~ .,
               data = wine_train,
               method = 'glm',
               preProcess = c('scale', 'center'),
               trControl = trainControl(method = 'repeatedcv',
                                        number = 10,
                                        repeats = 5,
                                        verboseIter = FALSE))
summary(m_glm)
## 
## Call:
## NULL
## 
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max
## -2.74514 -0.39068 -0.03602  0.46446  1.94156
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)             5.6310000  0.0208497 270.076  < 2e-16 ***
## `fixed acidity`         0.0761863  0.0578013   1.318   0.1878
## `volatile acidity`     -0.2041202  0.0286132  -7.134 1.89e-12 ***
## `citric acid`          -0.0506371  0.0379069  -1.336   0.1819
## `residual sugar`        0.0008766  0.0268816   0.033   0.9740
## chlorides              -0.1061580  0.0260272  -4.079 4.89e-05 ***
## `free sulfur dioxide`   0.0678305  0.0296675   2.286   0.0224 *
## `total sulfur dioxide` -0.1354089  0.0314033  -4.312 1.78e-05 ***
## density                -0.0526224  0.0525585  -1.001   0.3170
## pH                     -0.0662858  0.0378737  -1.750   0.0804 .
## sulphates               0.1558800  0.0259767   6.001 2.75e-09 ***
## alcohol                 0.2487445  0.0370546   6.713 3.21e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.43471)
## 
##     Null deviance: 652.84  on 999  degrees of freedom
## Residual deviance: 429.49  on 988  degrees of freedom
## AIC: 2018.7
## 
## Number of Fisher Scoring iterations: 2
# Variable importance and test-set predictions for the GLM
imp_glm <- varImp(m_glm)
wine_test$pred_qual_glm <- predict(m_glm, wine_test)

# Correlation and mean absolute error between observed and predicted quality
cor_glm <- cor(wine_test$quality, wine_test$pred_qual_glm)
mae_glm <- mean(abs(wine_test$quality - wine_test$pred_qual_glm))

# Importance, scatter and box plots, kept for the comparison at the end
pimp_glm <- ggplot(imp_glm, top = dim(imp_glm$importance)[1]) +
  ggtitle('GLM')
point_glm <- point_plot(wine_test$pred_qual_glm) +
  ggtitle('GLM')
box_glm <- box_plot(wine_test$pred_qual_glm) +
  ggtitle('GLM')
cor_glm
## [1] 0.6163901
mae_glm
## [1] 0.4944364
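caret can compute the same kind of test-set metrics in one call; a small sketch using postResample(), which returns RMSE, R squared and MAE for the GLM predictions:

# RMSE, Rsquared and MAE on the held-out test set
postResample(pred = wine_test$pred_qual_glm, obs = wine_test$quality)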
imp_glm
## glm variable importance
## 
##                        Overall
## `volatile acidity`      100.00
## alcohol                  94.07
## sulphates                84.04
## `total sulfur dioxide`   60.26
## chlorides                56.98
## `free sulfur dioxide`    31.74
## pH                       24.19
## `citric acid`            18.35
## `fixed acidity`          18.10
## density                  13.64
## `residual sugar`          0.00
pimp_glm

point_glm

box_glm

Running a Random Forest model
m_rf <- train(quality ~ .,
              data = wine_train,
              method = 'rf',
              preProcess = c('scale', 'center'),
              trControl = trainControl(method = 'repeatedcv',
                                       number = 10,
                                       repeats = 5,
                                       verboseIter = FALSE),
              tuneGrid = expand.grid(mtry = seq(3, 9, 1)),
              verbose = 0)
summary(m_rf)
##                 Length Class      Mode
## call                 5  -none-     call
## type                 1  -none-     character
## predicted         1000  -none-     numeric
## mse                500  -none-     numeric
## rsq                500  -none-     numeric
## oob.times         1000  -none-     numeric
## importance          11  -none-     numeric
## importanceSD         0  -none-     NULL
## localImportance      0  -none-     NULL
## proximity            0  -none-     NULL
## ntree                1  -none-     numeric
## mtry                 1  -none-     numeric
## forest              11  -none-     list
## coefs                0  -none-     NULL
## y                 1000  -none-     numeric
## test                 0  -none-     NULL
## inbag                0  -none-     NULL
## xNames              11  -none-     character
## problemType          1  -none-     character
## tuneValue            1  data.frame list
## obsLevels            1  -none-     logical
## param                1  -none-     list
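summary() on a caret random forest only lists the components of the underlying randomForest object, so it is not very informative. A short sketch of two more readable alternatives:

# Resampling performance across the mtry grid, and which mtry was selected
print(m_rf)

# The final randomForest fit, with out-of-bag MSE and % variance explained
print(m_rf$finalModel)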
imp_rf <- varImp(m_rf)
wine_test$pred_qual_rf <- predict(m_rf, wine_test)
cor_rf <- cor(wine_test$quality, wine_test$pred_qual_rf)
mae_rf <- mean(abs(wine_test$quality - wine_test$pred_qual_rf))
pimp_rf <- ggplot(imp_rf, top = dim(imp_rf$importance)[1]) +
  ggtitle('Random Forest')
point_rf <- point_plot(wine_test$pred_qual_rf) +
  ggtitle('Random Forest')
box_rf <- box_plot(wine_test$pred_qual_rf) +
  ggtitle('Random Forest')
cor_rf
## [1] 0.6965844
mae_rf
## [1] 0.4340915
imp_rf
## rf variable importance
## 
##                         Overall
## alcohol                100.0000
## sulphates               77.0694
## `volatile acidity`      61.4146
## density                 27.9332
## `total sulfur dioxide`  26.5110
## `citric acid`           16.3673
## chlorides               13.9156
## `fixed acidity`         12.3706
## pH                       5.6873
## `free sulfur dioxide`    0.5393
## `residual sugar`         0.0000
pimp_rf

point_rf

box_rf

Running an XGBoost model
library(xgboost)
m_xgbt <- train(quality ~ .,
                data = wine_train,
                method = 'xgbTree',
                preProcess = c('scale', 'center'),
                trControl = trainControl(method = 'repeatedcv',
                                         number = 10,
                                         repeats = 5,
                                         verboseIter = FALSE),
                tuneGrid = expand.grid(nrounds = 50,
                                       max_depth = seq(1, 7, 1),
                                       eta = c(0.1, 0.3, 0.5),
                                       gamma = 0,
                                       colsample_bytree = 0.8,
                                       min_child_weight = 10,
                                       subsample = seq(0.7, 1, 0.1)),
                verbose = 0)
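The grid above spans 7 x 3 x 4 = 84 hyperparameter combinations (nrounds, gamma, colsample_bytree and min_child_weight are held fixed), each evaluated over 50 resamples. A minimal sketch for inspecting which combination was chosen:

# Hyperparameter combination with the best cross-validated RMSE
m_xgbt$bestTune

# Resampled RMSE across the whole tuning grid
ggplot(m_xgbt)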
summary(m_xgbt)
##               Length Class              Mode
## handle             1  xgb.Booster.handle externalptr
## raw            82623  -none-             raw
## niter              1  -none-             numeric
## call               6  -none-             call
## params             8  -none-             list
## callbacks          0  -none-             list
## feature_names     11  -none-             character
## nfeatures          1  -none-             numeric
## xNames            11  -none-             character
## problemType        1  -none-             character
## tuneValue          7  data.frame         list
## obsLevels          1  -none-             logical
## param              1  -none-             list
imp_xgbt <- varImp(m_xgbt)
wine_test$pred_qual_xgbt <- predict(m_xgbt, wine_test)
cor_xgbt <- cor(wine_test$quality, wine_test$pred_qual_xgbt)
mae_xgbt <- mean(abs(wine_test$quality - wine_test$pred_qual_xgbt))
pimp_xgbt <- ggplot(imp_xgbt, top = dim(imp_xgbt$importance)[1]) +
  ggtitle('XGBoost')
point_xgbt <- point_plot(wine_test$pred_qual_xgbt) +
  ggtitle('XGBoost')
box_xgbt <- box_plot(wine_test$pred_qual_xgbt) +
  ggtitle('XGBoost')
cor_xgbt
## [1] 0.664423
mae_xgbt
## [1] 0.454316
imp_xgbt
## xgbTree variable importance
## 
##                         Overall
## alcohol                100.0000
## sulphates               67.5154
## `volatile acidity`      48.9039
## `total sulfur dioxide`  24.0145
## density                 15.7746
## pH                      14.3283
## `fixed acidity`          8.1326
## chlorides                7.9906
## `citric acid`            5.1575
## `residual sugar`         0.8567
## `free sulfur dioxide`    0.0000
pimp_xgbt

point_xgbt

box_xgbt

Comparing models
library(patchwork)
Fit_Trees <- tibble(Method = c('GLM', 'Random Forest', 'XGBoost'),
                    Correlation = c(cor_glm, cor_rf, cor_xgbt),
                    MAE = c(mae_glm, mae_rf, mae_xgbt))
Fit_Trees
## # A tibble: 3 x 3
##   Method        Correlation   MAE
##   <chr>               <dbl> <dbl>
## 1 GLM                 0.616 0.494
## 2 Random Forest       0.697 0.434
## 3 XGBoost             0.664 0.454
pimp_glm + pimp_rf + pimp_xgbt +
  plot_layout(ncol = 3)

point_glm + point_rf + point_xgbt +
  plot_layout(ncol = 3)

box_glm + box_rf + box_xgbt +
  plot_layout(ncol = 3)

Best_Fit <- Fit_Trees %>%
  filter(Correlation == max(Correlation)) %>%
  select(Method)
Best_Fit
## # A tibble: 1 x 1
##   Method
##   <chr>
## 1 Random Forest
m_comp <- resamples(list(GLM = m_glm, RF = m_rf, XGBT = m_xgbt))
summary(m_comp)
## 
## Call:
## summary.resamples(object = m_comp)
## 
## Models: GLM, RF, XGBT
## Number of resamples: 50
## 
## MAE
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM  0.4333245 0.4955506 0.5199473 0.5176068 0.5390099 0.5841643    0
## RF   0.3727423 0.4294515 0.4482026 0.4550239 0.4842996 0.5332090    0
## XGBT 0.3929829 0.4481075 0.4671578 0.4726427 0.4927862 0.5405050    0
## 
## RMSE
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM  0.5330731 0.6362550 0.6634441 0.6644395 0.7001193 0.7470267    0
## RF   0.4976832 0.5624795 0.5938968 0.6063846 0.6525742 0.7247424    0
## XGBT 0.4982951 0.5965353 0.6249988 0.6274140 0.6539871 0.7092745    0
## 
## Rsquared
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM  0.1452399 0.2822405 0.3251516 0.3302616 0.3691090 0.4881496    0
## RF   0.3080842 0.3934321 0.4430471 0.4446784 0.4973608 0.5926010    0
## XGBT 0.2391429 0.3621147 0.4037764 0.4041452 0.4424818 0.5602899    0
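Since all three models were resampled with the same repeated cross-validation settings, caret can also summarise the pairwise differences between them; a minimal sketch using diff() on the resamples object:

# Pairwise differences in MAE, RMSE and Rsquared across resamples,
# with t-tests on those differences
summary(diff(m_comp))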
comp_mae <- ggplot(m_comp, metric = 'MAE') +
  ggtitle('MAE')
comp_rmse <- ggplot(m_comp, metric = 'RMSE') +
  ggtitle('RMSE')
comp_rsq <- ggplot(m_comp, metric = 'Rsquared') +
  ggtitle('R squared')
comp_mae + comp_rmse + comp_rsq +
  plot_layout(ncol = 3)
