ANALYSIS OF WINE QUALITY DATA USING R

Predicting wine quality from limited data (mostly chemical composition) using machine learning.

See: https://archive.ics.uci.edu/ml/datasets/wine

require(rpart, quietly = TRUE)
require(keras, quietly = TRUE)
require(xgboost, quietly = TRUE)
require(glmnet, quietly = TRUE)
## Loaded glmnet 2.0-16
require(RColorBrewer, quietly = TRUE)

# Helper: format the squared Pearson correlation as an R-squared label
Rsq = function(y_pred, y_true) sprintf("R-sq = %.4f", cor(y_pred, y_true)^2)

red = read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
  sep=";")
print(dim(red))
## [1] 1599   12
white = read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
  sep=";")
print(dim(white))
## [1] 4898   12
wine = rbind(
  cbind(wine.color="red",red),
  cbind(wine.color="white",white)
  )

rm(red,white)

Exploratory Data Analysis

Wine Color

tbl = with(wine, table(quality,wine.color))
plot( tbl, col=c("red2","lightgrey"))
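
The underlying proportions behind the mosaic plot can also be printed directly (a small addition for reference):

# quality distribution within each color
round(prop.table(tbl, margin = 2), 3)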

Wine color turns out to be easy to predict from the chemistry alone.

color.rp = rpart(wine.color~.,data=wine,parms=list(prior=c(0.5,0.5)))
plotcp(color.rp)

print(color.rp)
## n= 6497 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 6497 3248.50000 red (0.50000000 0.50000000)  
##   2) chlorides>=0.0585 1912  295.13730 red (0.90989463 0.09010537)  
##     4) total.sulfur.dioxide< 148.5 1658  131.98270 red (0.95737076 0.04262924) *
##     5) total.sulfur.dioxide>=148.5 254   16.25266 white (0.09059088 0.90940912) *
##   3) chlorides< 0.0585 4585  268.16890 white (0.08324266 0.91675734)  
##     6) total.sulfur.dioxide< 54 145   35.81441 red (0.83771501 0.16228499) *
##     7) total.sulfur.dioxide>=54 4440   83.29487 white (0.02775716 0.97224284) *
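
As a quick sanity check, the tree's confusion matrix can be tabulated. Note this is in-sample and therefore optimistic; a sketch:

# in-sample confusion matrix for the color tree
table(predicted = predict(color.rp, type = "class"),
      actual    = wine$wine.color)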

One-Way Analysis

# distribution of each predictor across quality grades
for(col in names(wine)[2:12]){
  boxplot(wine[[col]]~wine$quality,col=brewer.pal(n = 7, name = "RdBu"),
          ylab=col,xlab='Quality',varwidth=TRUE)
}

Predictor Interactions

# pairwise scatterplots on a 500-row sample (the full data overplots)
smp = sample(nrow(wine),500)
plot(wine[smp,2:6],panel=panel.smooth)

plot(wine[smp,7:12],panel=panel.smooth)

Cross-Validation Prep

n_fold = 3
n_repeat = 5
set.seed(2019)
cv_folds = caret::createMultiFolds(wine$quality, k = n_fold, times = n_repeat)
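
One detail worth noting before the modeling loops: createMultiFolds returns the *training* indices of each resample, so wine[fold_,] is the training set and wine[-fold_,] the held-out fold. A quick look at the structure (sketch; fold names as generated by caret):

length(cv_folds)          # n_fold * n_repeat = 15 resamples
head(names(cv_folds), 3)  # e.g. "Fold1.Rep1" "Fold2.Rep1" "Fold3.Rep1"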

Decision Tree Analysis

# out-of-fold predictions, one column per repeat
oof_rpart = matrix(0,nrow=nrow(wine),ncol=n_repeat)

i = 1
rep = 1
for(fold_ in cv_folds){
  wine.rp = rpart(quality~.,data=wine[fold_,],cp=0.001)
  #plotcp(wine.rp)
  # choose the cp minimizing xerror + xstd (a conservative, 1-SE-flavored pick)
  wine.rp.opt_id = which.min(apply(as.matrix(wine.rp$cptable)[,4:5],1,sum))
  wine.rp.opt_cp = as.matrix(wine.rp$cptable)[wine.rp.opt_id,1]
  
  # predict on the held-out fold with the pruned tree
  oof_rpart[-fold_,rep] = predict(prune(wine.rp,wine.rp.opt_cp), 
                                  newdata=wine[-fold_,])
  
  i = i + 1
  if(i %% n_fold == 1){  # advance to the next repeat after every n_fold folds
    rep = rep + 1
  }
}


# average out-of-fold predictions across the repeats
pred_rp = rowMeans(oof_rpart)

#summary(prune(wine.rp,0.005))
# a representative tree (from the last CV fit), pruned harder for legibility
plot(prune(wine.rp,0.006),margin=0.1)
text(prune(wine.rp,0.006),cex=0.63)

boxplot(pred_rp~wine$quality,varwidth=TRUE,col="cornsilk",
        sub=Rsq(pred_rp,wine$quality))
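
For comparing the models that follow, an out-of-fold RMSE helper is handy (a small addition, mirroring the Rsq helper above):

# root mean squared error of out-of-fold predictions
rmse = function(y_pred, y_true) sqrt(mean((y_pred - y_true)^2))
rmse(pred_rp, wine$quality)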

OLS

oof_lm = matrix(0,nrow=nrow(wine),ncol=n_repeat)

i = 1
rep = 1
for(fold_ in cv_folds){
  # linear model with one hand-picked interaction (alcohol x volatile acidity)
  wine.lm = lm(quality~.+alcohol:volatile.acidity,data=wine[fold_,])

  oof_lm[-fold_,rep] = predict(wine.lm,newdata=wine[-fold_,])
  
  i = i + 1
  if(i  %% n_fold == 1){
    rep = rep + 1    
  }
}

pred_lm = rowMeans(oof_lm)

summary(wine.lm)  # model fit on the last CV fold
## 
## Call:
## lm(formula = quality ~ . + alcohol:volatile.acidity, data = wine[fold_, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3272 -0.4596 -0.0398  0.4492  2.6994 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.003e+02  1.687e+01   5.948 2.92e-09 ***
## wine.colorwhite          -3.206e-01  6.879e-02  -4.660 3.25e-06 ***
## fixed.acidity             9.296e-02  1.926e-02   4.826 1.44e-06 ***
## volatile.acidity         -2.628e+00  7.114e-01  -3.694 0.000224 ***
## citric.acid              -1.397e-01  9.636e-02  -1.450 0.147258    
## residual.sugar            5.920e-02  7.104e-03   8.334  < 2e-16 ***
## chlorides                -6.490e-01  4.290e-01  -1.513 0.130409    
## free.sulfur.dioxide       6.058e-03  9.599e-04   6.311 3.05e-10 ***
## total.sulfur.dioxide     -1.509e-03  3.957e-04  -3.814 0.000139 ***
## density                  -9.926e+01  1.707e+01  -5.815 6.50e-09 ***
## pH                        5.131e-01  1.101e-01   4.659 3.28e-06 ***
## sulphates                 7.446e-01  9.302e-02   8.005 1.53e-15 ***
## alcohol                   1.908e-01  3.436e-02   5.552 2.99e-08 ***
## volatile.acidity:alcohol  1.069e-01  6.770e-02   1.580 0.114293    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7348 on 4317 degrees of freedom
## Multiple R-squared:  0.2978, Adjusted R-squared:  0.2957 
## F-statistic: 140.8 on 13 and 4317 DF,  p-value: < 2.2e-16
plot(wine.lm)  # standard lm residual diagnostics

boxplot(pred_lm~wine$quality,varwidth=TRUE,col="cornsilk",
        sub=Rsq(pred_lm,wine$quality))

XGB Model

oof_xgb = matrix(0,nrow=nrow(wine),ncol=n_repeat)

params <- list(
  max_depth = 4, 
  eta = 0.1, 
  min_child_weight = 50,
  #eval_metric=c('ndcg'),
  objective = "reg:linear",  # renamed "reg:squarederror" in newer xgboost releases
  subsample = 0.6,
  colsample_bytree = 0.8
)

i = 1
rep = 1
for(fold_ in cv_folds){
  dtrain <- xgb.DMatrix(model.matrix(quality~.-1,data=wine[fold_,]), 
                        label = wine$quality[fold_])
  dtest <- xgb.DMatrix(model.matrix(quality~.-1,data=wine[-fold_,]), 
                        label = wine$quality[-fold_])
  # early stopping monitors the last watchlist entry, so the validation set goes last
  wine.xgb <- xgb.train(
    params = params, data = dtrain, nrounds = 1500, verbose = 0,
    watchlist = list(trn=dtrain, val=dtest),
    early_stopping_rounds = 100)
  
  oof_xgb[-fold_,rep] = predict(wine.xgb,dtest)
  
  i = i + 1
  if(i  %% n_fold == 1){
    rep = rep + 1    
  }
}
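
With early stopping, the booster records the best round it found; inspecting it on the last fold's model is a useful check (sketch; these fields are set by xgb.train when early_stopping_rounds is used):

wine.xgb$best_iteration  # best boosting round on the held-out fold
wine.xgb$best_score      # validation RMSE at that round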
# variable importance from the last CV fold's booster
xgb.plot.importance(xgb.importance(model=wine.xgb),measure = "Gain")

pred.xgb = rowMeans(oof_xgb)
#summary(pred.xgb)
boxplot(pred.xgb~wine$quality,varwidth=TRUE,col="cornsilk",
        sub=Rsq(pred.xgb,wine$quality))

Deep Learning Model

mat = model.matrix(quality~.-1,data=wine)

# column means and standard deviations, used to standardize the inputs
estimate_fc_stats = function(m) {
  m = as.matrix(m)
  stdev = apply(m,2,sd)
  avg   = apply(m,2,mean)
  data.frame(mean=avg,stdev=stdev)
}

fc_stats = estimate_fc_stats(mat)

# z-score every column using the stored statistics
fc_standardize = function(m){
  t(apply(m,1,function(x) (x - fc_stats$mean) / fc_stats$stdev))
}

mat_norm = fc_standardize(mat)
# summary(mat_norm)
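
Equivalently, base R's scale() applies the same statistics column-wise (a sketch; matches fc_standardize up to floating-point error):

# same standardization via scale(); center and scale are per-column vectors
mat_norm2 = scale(mat, center = fc_stats$mean, scale = fc_stats$stdev)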

make_model <- function(){
  model <- keras_model_sequential()
  model %>%
    layer_dense(units = 64, activation = 'relu', input_shape = c(ncol(mat))) %>%
    layer_dropout(rate = 0.35) %>% 
    layer_dense(units = 32, activation = 'relu') %>%
    layer_dropout(rate = 0.35) %>% 
    layer_dense(units = 1, activation = 'linear')  # single linear output for regression
}

#summary(make_model())


oof_fc = matrix(0,nrow=nrow(wine),ncol=n_repeat)


i = 1
rep = 1
for(fold_ in cv_folds){
  
  wine.fc <- make_model()
  
  wine.fc %>% compile(
    loss = 'mse',
    # note: newer keras versions spell this learning_rate rather than lr
    optimizer = optimizer_rmsprop(lr = 0.003, rho = 0.90, decay = 2e-4, clipvalue = 0.2)
  )
  
  history <- wine.fc %>% fit(
    mat_norm[fold_,], wine$quality[fold_],
    batch_size = 1024,
    epochs = 1000,
    verbose = 0,
    #class_weight = as.list(rep(1,7)/7),
    validation_data = list(mat_norm[-fold_,], wine$quality[-fold_]),
    callbacks = list(
      # stop after 100 stagnant epochs and restore the best weights seen
      callback_early_stopping(patience = 100, verbose = 1, restore_best_weights = TRUE)
    )
  )
  
  print(history)
  #plot(history)
  
  oof_fc[-fold_,rep] = predict(wine.fc, mat_norm[-fold_,])
  
  i = i + 1
  if(i  %% n_fold == 1){
    rep = rep + 1    
  }
}
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=553)
## Final epoch (plot to see history):
##     loss: 0.4678
## val_loss: 0.4594 
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=663)
## Final epoch (plot to see history):
##     loss: 0.4528
## val_loss: 0.4748 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=938)
## Final epoch (plot to see history):
##     loss: 0.4502
## val_loss: 0.4515 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=633)
## Final epoch (plot to see history):
##     loss: 0.4576
## val_loss: 0.4875 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=639)
## Final epoch (plot to see history):
##     loss: 0.4603
## val_loss: 0.4548 
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=1,000)
## Final epoch (plot to see history):
##     loss: 0.4539
## val_loss: 0.4535 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=725)
## Final epoch (plot to see history):
##     loss: 0.4569
## val_loss: 0.449 
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=812)
## Final epoch (plot to see history):
##     loss: 0.4473
## val_loss: 0.4728 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=767)
## Final epoch (plot to see history):
##     loss: 0.4637
## val_loss: 0.4572 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=705)
## Final epoch (plot to see history):
##     loss: 0.4576
## val_loss: 0.473 
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=1,000)
## Final epoch (plot to see history):
##     loss: 0.422
## val_loss: 0.4707 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=616)
## Final epoch (plot to see history):
##     loss: 0.479
## val_loss: 0.4433 
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=736)
## Final epoch (plot to see history):
##     loss: 0.4602
## val_loss: 0.4705 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=751)
## Final epoch (plot to see history):
##     loss: 0.4595
## val_loss: 0.4686 
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=585)
## Final epoch (plot to see history):
##     loss: 0.4714
## val_loss: 0.4552
pred.fc = rowMeans(oof_fc)
#summary(pred.fc)
boxplot(pred.fc~wine$quality,varwidth=TRUE,col="cornsilk",
        sub=Rsq(pred.fc,wine$quality))

Model Stacking

# gather every out-of-fold prediction matrix (one per base model)
mod_names = ls(pattern="oof_")
mods = mget(mod_names)
for(m in names(mods)){
  colnames(mods[[m]]) = paste(m,1:ncol(mods[[m]]),sep="_")
}

stack = do.call("cbind",mods)
summary(stack)
##     oof_fc_1        oof_fc_2        oof_fc_3        oof_fc_4    
##  Min.   :4.017   Min.   :4.019   Min.   :4.446   Min.   :4.093  
##  1st Qu.:5.381   1st Qu.:5.355   1st Qu.:5.378   1st Qu.:5.366  
##  Median :5.784   Median :5.790   Median :5.771   Median :5.776  
##  Mean   :5.801   Mean   :5.829   Mean   :5.809   Mean   :5.792  
##  3rd Qu.:6.163   3rd Qu.:6.222   3rd Qu.:6.187   3rd Qu.:6.165  
##  Max.   :7.651   Max.   :7.710   Max.   :7.629   Max.   :7.722  
##     oof_fc_5        oof_lm_1        oof_lm_2        oof_lm_3    
##  Min.   :4.198   Min.   :3.286   Min.   :3.245   Min.   :3.592  
##  1st Qu.:5.393   1st Qu.:5.481   1st Qu.:5.481   1st Qu.:5.480  
##  Median :5.790   Median :5.815   Median :5.812   Median :5.811  
##  Mean   :5.815   Mean   :5.819   Mean   :5.818   Mean   :5.818  
##  3rd Qu.:6.177   3rd Qu.:6.166   3rd Qu.:6.169   3rd Qu.:6.169  
##  Max.   :7.636   Max.   :7.697   Max.   :7.787   Max.   :7.772  
##     oof_lm_4        oof_lm_5      oof_rpart_1     oof_rpart_2   
##  Min.   :3.196   Min.   :3.480   Min.   :4.286   Min.   :4.143  
##  1st Qu.:5.481   1st Qu.:5.479   1st Qu.:5.342   1st Qu.:5.331  
##  Median :5.811   Median :5.815   Median :5.735   Median :5.764  
##  Mean   :5.818   Mean   :5.818   Mean   :5.817   Mean   :5.822  
##  3rd Qu.:6.167   3rd Qu.:6.164   3rd Qu.:6.211   3rd Qu.:6.215  
##  Max.   :7.704   Max.   :7.685   Max.   :7.636   Max.   :7.562  
##   oof_rpart_3     oof_rpart_4     oof_rpart_5      oof_xgb_1    
##  Min.   :4.000   Min.   :3.857   Min.   :3.875   Min.   :3.713  
##  1st Qu.:5.365   1st Qu.:5.387   1st Qu.:5.368   1st Qu.:5.320  
##  Median :5.827   Median :5.787   Median :5.763   Median :5.783  
##  Mean   :5.819   Mean   :5.818   Mean   :5.817   Mean   :5.825  
##  3rd Qu.:6.188   3rd Qu.:6.118   3rd Qu.:6.216   3rd Qu.:6.274  
##  Max.   :7.381   Max.   :8.000   Max.   :7.500   Max.   :8.155  
##    oof_xgb_2       oof_xgb_3       oof_xgb_4       oof_xgb_5    
##  Min.   :3.562   Min.   :3.761   Min.   :3.496   Min.   :3.207  
##  1st Qu.:5.332   1st Qu.:5.318   1st Qu.:5.310   1st Qu.:5.320  
##  Median :5.783   Median :5.776   Median :5.795   Median :5.798  
##  Mean   :5.823   Mean   :5.823   Mean   :5.819   Mean   :5.826  
##  3rd Qu.:6.263   3rd Qu.:6.263   3rd Qu.:6.248   3rd Qu.:6.281  
##  Max.   :8.072   Max.   :8.080   Max.   :8.037   Max.   :8.225
heatmap(cor(stack),main="Base Model Correlations",symm=TRUE)

library(penalized)
## Loading required package: survival
## Welcome to penalized. For extended examples, see vignette("penalized").
# non-negative least squares: positive weights, and ~0 drops the intercept
stack.lm = penalized(wine$quality,stack, ~0, positive=TRUE, trace=FALSE)
barplot(coef(stack.lm),las=3,main="Stacking Coefficients")

pred.stack = predict(stack.lm,stack)[,1]
#summary(pred.stack)
boxplot(pred.stack~wine$quality,varwidth=TRUE,
        col="cornsilk",main="Stacking Prediction Distribution",
        sub=Rsq(pred.stack,wine$quality))
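
Finally, a side-by-side comparison of each averaged base model against the stack, using the helpers defined above (a sketch):

preds = list(rpart = pred_rp, lm = pred_lm, xgb = pred.xgb,
             fc = pred.fc, stack = pred.stack)
data.frame(R.sq = sapply(preds, function(p) cor(p, wine$quality)^2),
           RMSE = sapply(preds, function(p) rmse(p, wine$quality)))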