Creating ploting functions

point_plot <- function(pred_qual){
  ggplot(wine_test, aes(quality, pred_qual)) +
  geom_point() +
  xlab('Observed Quality') +
  ylab('Predicted Quality') +
  stat_smooth(method = lm) +
  xlim(3, 8) +
  ylim(3.5, 7.5)
}

box_plot <- function(pred_qual){
  box_glm <- ggplot(wine_test, aes(quality, pred_qual)) +
  geom_boxplot(aes(group = cut_width(quality, 1))) +
  xlab('Observed Quality') +
  ylab('Predicted Quality') +
  xlim(3, 8) +
  ylim(3.5, 7.5)
}

Comparing machine learning methods using the Caret R Package for a red wine quality model

library(readr)
library(tidyverse)
library(caret)

wine <- read_delim('winequality-red.csv',
                                ';',
                                escape_double = FALSE,
                                trim_ws = TRUE)

ggplot(wine, aes(quality)) +
  geom_histogram(bins = 30)

rand_rows <- createDataPartition(wine$quality, p=0.625, list=FALSE)
wine_train <- wine[rand_rows, ]
wine_test <- wine[-rand_rows, ]

Running a Generalized Linear Model

m_glm <- train(quality ~ .,
               data = wine_train,
               method = 'glm',
               preProcess = c('scale', 'center'),
               trControl = trainControl(method = 'repeatedcv', 
                                        number = 10, 
                                        repeats = 5, 
                                        verboseIter = FALSE))
summary(m_glm)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.74514  -0.39068  -0.03602   0.46446   1.94156  
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   5.6310000  0.0208497 270.076  < 2e-16 ***
## `\\`fixed acidity\\``         0.0761863  0.0578013   1.318   0.1878    
## `\\`volatile acidity\\``     -0.2041202  0.0286132  -7.134 1.89e-12 ***
## `\\`citric acid\\``          -0.0506371  0.0379069  -1.336   0.1819    
## `\\`residual sugar\\``        0.0008766  0.0268816   0.033   0.9740    
## chlorides                    -0.1061580  0.0260272  -4.079 4.89e-05 ***
## `\\`free sulfur dioxide\\``   0.0678305  0.0296675   2.286   0.0224 *  
## `\\`total sulfur dioxide\\`` -0.1354089  0.0314033  -4.312 1.78e-05 ***
## density                      -0.0526224  0.0525585  -1.001   0.3170    
## pH                           -0.0662858  0.0378737  -1.750   0.0804 .  
## sulphates                     0.1558800  0.0259767   6.001 2.75e-09 ***
## alcohol                       0.2487445  0.0370546   6.713 3.21e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.43471)
## 
##     Null deviance: 652.84  on 999  degrees of freedom
## Residual deviance: 429.49  on 988  degrees of freedom
## AIC: 2018.7
## 
## Number of Fisher Scoring iterations: 2
imp_glm <- varImp(m_glm)

wine_test$pred_qual_glm <- predict(m_glm, wine_test)

cor_glm <- cor(wine_test$quality, wine_test$pred_qual_glm)
mae_glm <- mean(abs(wine_test$quality - wine_test$pred_qual_glm))

pimp_glm <- ggplot(imp_glm, top = dim(imp_glm$importance)[1]) +
  ggtitle('GLM')
point_glm <- point_plot(wine_test$pred_qual_glm) +
  ggtitle('GLM')
box_glm <- box_plot(wine_test$pred_qual_glm) +
  ggtitle('GLM')

cor_glm
## [1] 0.6163901
mae_glm
## [1] 0.4944364
imp_glm
## glm variable importance
## 
##                              Overall
## `\\`volatile acidity\\``      100.00
## alcohol                        94.07
## sulphates                      84.04
## `\\`total sulfur dioxide\\``   60.26
## chlorides                      56.98
## `\\`free sulfur dioxide\\``    31.74
## pH                             24.19
## `\\`citric acid\\``            18.35
## `\\`fixed acidity\\``          18.10
## density                        13.64
## `\\`residual sugar\\``          0.00
pimp_glm

point_glm

box_glm

Running a Random Forest model

m_rf <- train(quality ~ .,
              data = wine_train,
              method = 'rf',
              preProcess = c('scale', 'center'),
              trControl = trainControl(method = 'repeatedcv', 
                                       number = 10, 
                                       repeats = 5, 
                                       verboseIter = FALSE),
              tuneGrid = expand.grid(mtry = seq(3, 9, 1)),
              verbose = 0)
summary(m_rf)
##                 Length Class      Mode     
## call               5   -none-     call     
## type               1   -none-     character
## predicted       1000   -none-     numeric  
## mse              500   -none-     numeric  
## rsq              500   -none-     numeric  
## oob.times       1000   -none-     numeric  
## importance        11   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            11   -none-     list     
## coefs              0   -none-     NULL     
## y               1000   -none-     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            11   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          1   -none-     logical  
## param              1   -none-     list
imp_rf <- varImp(m_rf)

wine_test$pred_qual_rf <- predict(m_rf, wine_test)

cor_rf <- cor(wine_test$quality, wine_test$pred_qual_rf)
mae_rf <- mean(abs(wine_test$quality - wine_test$pred_qual_rf))

pimp_rf <- ggplot(imp_rf, top = dim(imp_rf$importance)[1]) +
  ggtitle('Random Forest')
point_rf <- point_plot(wine_test$pred_qual_rf) +
  ggtitle('Random Forest')
box_rf <- box_plot(wine_test$pred_qual_rf) +
  ggtitle('Random Forest')

cor_rf
## [1] 0.6965844
mae_rf
## [1] 0.4340915
imp_rf
## rf variable importance
## 
##                         Overall
## alcohol                100.0000
## sulphates               77.0694
## `volatile acidity`      61.4146
## density                 27.9332
## `total sulfur dioxide`  26.5110
## `citric acid`           16.3673
## chlorides               13.9156
## `fixed acidity`         12.3706
## pH                       5.6873
## `free sulfur dioxide`    0.5393
## `residual sugar`         0.0000
pimp_rf

point_rf

box_rf

Running a XGBoost model

library(xgboost)

m_xgbt <- train(quality ~ .,
                   data = wine_train,
                   method = 'xgbTree',
                   preProcess = c('scale', 'center'),
                   trControl = trainControl(method = 'repeatedcv', 
                                            number = 10, 
                                            repeats = 5, 
                                            verboseIter = FALSE),
                   tuneGrid = expand.grid(nrounds = 50,
                                          max_depth = seq(1, 7, 1),
                                          eta = c(0.1, 0.3, 0.5),
                                          gamma = 0,
                                          colsample_bytree = 0.8,
                                          min_child_weight = 10,
                                          subsample = seq(0.7, 1, 0.1)),
                   verbose = 0)
summary(m_xgbt)
##               Length Class              Mode       
## handle            1  xgb.Booster.handle externalptr
## raw           82623  -none-             raw        
## niter             1  -none-             numeric    
## call              6  -none-             call       
## params            8  -none-             list       
## callbacks         0  -none-             list       
## feature_names    11  -none-             character  
## nfeatures         1  -none-             numeric    
## xNames           11  -none-             character  
## problemType       1  -none-             character  
## tuneValue         7  data.frame         list       
## obsLevels         1  -none-             logical    
## param             1  -none-             list
imp_xgbt <- varImp(m_xgbt)

wine_test$pred_qual_xgbt <- predict(m_xgbt, wine_test)

cor_xgbt <- cor(wine_test$quality, wine_test$pred_qual_xgbt)
mae_xgbt <- mean(abs(wine_test$quality - wine_test$pred_qual_xgbt))

pimp_xgbt <- ggplot(imp_xgbt, top = dim(imp_rf$importance)[1]) +
  ggtitle('XGBoost')
point_xgbt <- point_plot(wine_test$pred_qual_xgbt) +
  ggtitle('XGBoost')
box_xgbt <- box_plot(wine_test$pred_qual_xgbt) +
  ggtitle('XGBoost')

cor_xgbt
## [1] 0.664423
mae_xgbt
## [1] 0.454316
imp_xgbt
## xgbTree variable importance
## 
##                         Overall
## alcohol                100.0000
## sulphates               67.5154
## `volatile acidity`      48.9039
## `total sulfur dioxide`  24.0145
## density                 15.7746
## pH                      14.3283
## `fixed acidity`          8.1326
## chlorides                7.9906
## `citric acid`            5.1575
## `residual sugar`         0.8567
## `free sulfur dioxide`    0.0000
pimp_xgbt

point_xgbt

box_xgbt

Comparing models

library(patchwork)

Fit_Trees <- tibble(Method = c('GLM', 'Random Forest', 'XGBoost'),
                    Correlation = c(cor_glm, cor_rf, cor_xgbt),
                    MAE = c(mae_glm, mae_rf, mae_xgbt))
Fit_Trees
## # A tibble: 3 x 3
##   Method        Correlation   MAE
##   <chr>               <dbl> <dbl>
## 1 GLM                 0.616 0.494
## 2 Random Forest       0.697 0.434
## 3 XGBoost             0.664 0.454
pimp_glm + pimp_rf + pimp_xgbt +
  plot_layout(ncol = 3)

point_glm + point_rf + point_xgbt +
  plot_layout(ncol = 3)

box_glm + box_rf + box_xgbt +
  plot_layout(ncol = 3)

Best_Fit <- Fit_Trees %>%
  filter(Correlation == max(Correlation)) %>%
  select(Method)
Best_Fit
## # A tibble: 1 x 1
##   Method       
##   <chr>        
## 1 Random Forest
m_comp <- resamples(list(GLM = m_glm, RF=m_rf, XGBT=m_xgbt))
summary(m_comp)
## 
## Call:
## summary.resamples(object = m_comp)
## 
## Models: GLM, RF, XGBT 
## Number of resamples: 50 
## 
## MAE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM  0.4333245 0.4955506 0.5199473 0.5176068 0.5390099 0.5841643    0
## RF   0.3727423 0.4294515 0.4482026 0.4550239 0.4842996 0.5332090    0
## XGBT 0.3929829 0.4481075 0.4671578 0.4726427 0.4927862 0.5405050    0
## 
## RMSE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM  0.5330731 0.6362550 0.6634441 0.6644395 0.7001193 0.7470267    0
## RF   0.4976832 0.5624795 0.5938968 0.6063846 0.6525742 0.7247424    0
## XGBT 0.4982951 0.5965353 0.6249988 0.6274140 0.6539871 0.7092745    0
## 
## Rsquared 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM  0.1452399 0.2822405 0.3251516 0.3302616 0.3691090 0.4881496    0
## RF   0.3080842 0.3934321 0.4430471 0.4446784 0.4973608 0.5926010    0
## XGBT 0.2391429 0.3621147 0.4037764 0.4041452 0.4424818 0.5602899    0
comp_mae <- ggplot(m_comp, metric = 'MAE') +
  ggtitle('MAE')
comp_rmse <- ggplot(m_comp, metric = 'RMSE') +
  ggtitle('RMSE')
comp_rsq <- ggplot(m_comp, metric = 'Rsquared') +
  ggtitle('R squared')
comp_mae + comp_rmse + comp_rsq +
  plot_layout(ncol = 3)