Predict wine quality using limited data (mostly chemical composition) and machine learning.
See: https://archive.ics.uci.edu/ml/datasets/wine
require(rpart, quietly = TRUE)
require(keras, quietly = TRUE)
require(xgboost, quietly = TRUE)
require(glmnet, quietly = TRUE)
## Loaded glmnet 2.0-16
require(RColorBrewer, quietly = TRUE)
Rsq = function(y_pred, y_true) sprintf("R-sq = %.4f", cor(y_pred, y_true)^2)
red = read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
  sep=";")
print(dim(red))
## [1] 1599 12
white = read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
  sep=";")
print(dim(white))
## [1] 4898 12
wine = rbind(
  cbind(wine.color="red", red),
  cbind(wine.color="white", white)
)
rm(red,white)
tbl = with(wine, table(quality,wine.color))
plot(tbl, col=c("red2","lightgrey"))
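To read the exact shares off the table rather than the plot, a quick proportion table (a supplementary check):
# Share of each quality score within each color; columns sum to 1
round(prop.table(tbl, margin=2), 3)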
It is easy to predict wine color: a shallow tree splitting on chlorides and total sulfur dioxide does most of the work.
color.rp = rpart(wine.color~.,data=wine,parms=list(prior=c(0.5,0.5)))
plotcp(color.rp)
print(color.rp)
## n= 6497
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 6497 3248.50000 red (0.50000000 0.50000000)
## 2) chlorides>=0.0585 1912 295.13730 red (0.90989463 0.09010537)
## 4) total.sulfur.dioxide< 148.5 1658 131.98270 red (0.95737076 0.04262924) *
## 5) total.sulfur.dioxide>=148.5 254 16.25266 white (0.09059088 0.90940912) *
## 3) chlorides< 0.0585 4585 268.16890 white (0.08324266 0.91675734)
## 6) total.sulfur.dioxide< 54 145 35.81441 red (0.83771501 0.16228499) *
## 7) total.sulfur.dioxide>=54 4440 83.29487 white (0.02775716 0.97224284) *
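To quantify "easy", an in-sample confusion matrix for the tree (a sanity check, not part of the modeling pipeline):
# Resubstitution confusion matrix for the color tree
table(predicted = predict(color.rp, type="class"), actual = wine$wine.color)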
# Distribution of each chemical predictor by quality score
for(col in names(wine)[2:12]){
  boxplot(wine[[col]]~wine$quality, col=brewer.pal(n = 7, name = "RdBu"),
          ylab=col, xlab='Quality', varwidth=TRUE)
}
### Predictor Interactions
# Smoothed scatterplot matrices on a 500-row sample to keep plotting fast
smp = sample(nrow(wine), 500)
plot(wine[smp, 2:6], panel=panel.smooth)
plot(wine[smp, 7:12], panel=panel.smooth)
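The smoothers hint at a few strong relationships; a correlation matrix over the full data (rather than the 500-row plotting sample) pins them down:
# Pairwise correlations among the eleven chemical predictors
round(cor(wine[, 2:12]), 2)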
## Cross Validation Prep
n_fold = 3
n_repeat = 5
set.seed(2019)
cv_folds = caret::createMultiFolds(wine$quality, k = n_fold, times = n_repeat)
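The fold loops below assume createMultiFolds returns the k training-index sets of repeat 1 first, then repeat 2, and so on; the i/rep bookkeeping advances the repeat counter every n_fold folds. A quick look at the structure (illustrative only):
# Expect names Fold1.Rep1 ... Fold3.Rep5, in repeat-major order
print(names(cv_folds))
# With k = 3, each training set holds roughly 2/3 of the rows
print(lengths(cv_folds) / nrow(wine))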
oof_rpart = matrix(0, nrow=nrow(wine), ncol=n_repeat)
i = 1
rep = 1
for(fold_ in cv_folds){
  wine.rp = rpart(quality~., data=wine[fold_,], cp=0.001)
  #plotcp(wine.rp)
  # Choose the cp minimizing xerror + xstd (a 1-SE-style rule), then prune
  wine.rp.opt_id = which.min(apply(as.matrix(wine.rp$cptable)[,4:5], 1, sum))
  wine.rp.opt_cp = as.matrix(wine.rp$cptable)[wine.rp.opt_id, 1]
  oof_rpart[-fold_,rep] = predict(prune(wine.rp, wine.rp.opt_cp),
                                  newdata=wine[-fold_,])
  # cv_folds holds the k folds of repeat 1 first, then repeat 2, ...:
  # advance the repeat counter after every n_fold folds
  i = i + 1
  if(i %% n_fold == 1){
    rep = rep + 1
  }
}
# Average out-of-fold predictions across repeats
pred_rp = rowMeans(oof_rpart)
#summary(prune(wine.rp,0.005))
# A heavily pruned version of the last fold's tree, for readability
plot(prune(wine.rp, 0.006), margin=0.1)
text(prune(wine.rp, 0.006), cex=0.63)
boxplot(pred_rp~wine$quality, varwidth=TRUE, col="cornsilk",
        sub=Rsq(pred_rp,wine$quality))
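Averaging over repeats smooths fold-to-fold noise; comparing per-repeat R-squared against the averaged prediction shows the gain (a supplementary check):
# R-squared of each repeat's OOF column, then of the across-repeat average
apply(oof_rpart, 2, function(p) cor(p, wine$quality)^2)
cor(pred_rp, wine$quality)^2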
oof_lm = matrix(0,nrow=nrow(wine),ncol=n_repeat)
i = 1
rep = 1
for(fold_ in cv_folds){
  # Linear model with one added interaction (alcohol x volatile.acidity)
  wine.lm = lm(quality~.+alcohol:volatile.acidity, data=wine[fold_,])
  oof_lm[-fold_,rep] = predict(wine.lm, newdata=wine[-fold_,])
  i = i + 1
  if(i %% n_fold == 1){
    rep = rep + 1
  }
}
pred_lm = rowMeans(oof_lm)
summary(wine.lm)  # coefficients from the last fold's fit
##
## Call:
## lm(formula = quality ~ . + alcohol:volatile.acidity, data = wine[fold_,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3272 -0.4596 -0.0398 0.4492 2.6994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.003e+02 1.687e+01 5.948 2.92e-09 ***
## wine.colorwhite -3.206e-01 6.879e-02 -4.660 3.25e-06 ***
## fixed.acidity 9.296e-02 1.926e-02 4.826 1.44e-06 ***
## volatile.acidity -2.628e+00 7.114e-01 -3.694 0.000224 ***
## citric.acid -1.397e-01 9.636e-02 -1.450 0.147258
## residual.sugar 5.920e-02 7.104e-03 8.334 < 2e-16 ***
## chlorides -6.490e-01 4.290e-01 -1.513 0.130409
## free.sulfur.dioxide 6.058e-03 9.599e-04 6.311 3.05e-10 ***
## total.sulfur.dioxide -1.509e-03 3.957e-04 -3.814 0.000139 ***
## density -9.926e+01 1.707e+01 -5.815 6.50e-09 ***
## pH 5.131e-01 1.101e-01 4.659 3.28e-06 ***
## sulphates 7.446e-01 9.302e-02 8.005 1.53e-15 ***
## alcohol 1.908e-01 3.436e-02 5.552 2.99e-08 ***
## volatile.acidity:alcohol 1.069e-01 6.770e-02 1.580 0.114293
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7348 on 4317 degrees of freedom
## Multiple R-squared: 0.2978, Adjusted R-squared: 0.2957
## F-statistic: 140.8 on 13 and 4317 DF, p-value: < 2.2e-16
plot(wine.lm)
boxplot(pred_lm~wine$quality, varwidth=TRUE, col="cornsilk",
        sub=Rsq(pred_lm,wine$quality))
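glmnet is loaded above but never called. For completeness, a sketch of a cross-validated elastic net on the same design matrix; alpha = 0.5 is an arbitrary, untuned choice, and the fit below is in-sample, so its R-squared is not comparable to the OOF numbers:
# Elastic net on the one-hot design matrix; cv.glmnet tunes lambda internally
X = model.matrix(quality~.-1, data=wine)
wine.glmnet = cv.glmnet(X, wine$quality, alpha = 0.5)
pred.glmnet = predict(wine.glmnet, X, s = "lambda.min")[,1]
Rsq(pred.glmnet, wine$quality)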
oof_xgb = matrix(0,nrow=nrow(wine),ncol=n_repeat)
params <- list(
  max_depth = 4,
  eta = 0.1,
  min_child_weight = 50,
  #eval_metric=c('ndcg'),
  objective = "reg:linear",  # renamed to "reg:squarederror" in newer xgboost
  subsample = 0.6,
  colsample_bytree = 0.8
)
i = 1
rep = 1
for(fold_ in cv_folds){
  dtrain <- xgb.DMatrix(model.matrix(quality~.-1, data=wine[fold_,]),
                        label = wine$quality[fold_])
  dtest <- xgb.DMatrix(model.matrix(quality~.-1, data=wine[-fold_,]),
                       label = wine$quality[-fold_])
  # xgb.train early-stops on the last watchlist entry, so the held-out
  # set must come last
  wine.xgb <- xgb.train(
    params = params, data = dtrain, nrounds = 1500, verbose = 0,
    watchlist = list(trn=dtrain, val=dtest),
    early_stopping_rounds = 100)
  # Predict with the best iteration found by early stopping
  oof_xgb[-fold_,rep] = predict(wine.xgb, dtest,
                                ntreelimit = wine.xgb$best_ntreelimit)
  i = i + 1
  if(i %% n_fold == 1){
    rep = rep + 1
  }
}
# Feature importance from the last fold's model
xgb.plot.importance(xgb.importance(model=wine.xgb), measure = "Gain")
pred.xgb = rowMeans(oof_xgb)
#summary(pred.xgb)
boxplot(pred.xgb~wine$quality, varwidth=TRUE, col="cornsilk",
        sub=Rsq(pred.xgb,wine$quality))
mat = model.matrix(quality~.-1, data=wine)
# Column means and standard deviations, used to standardize the network inputs
estimate_fc_stats = function(m) {
  m = as.matrix(m)
  stdev = apply(m, 2, sd)
  avg = apply(m, 2, mean)
  data.frame(mean=avg, stdev=stdev)
}
fc_stats = estimate_fc_stats(mat)
# Center and scale each column with the precomputed stats
# (note: the stats come from all rows, so the CV folds share the scaling)
fc_standardize = function(m){
  t(apply(m, 1, function(x) (x - fc_stats$mean) / fc_stats$stdev))
}
mat_norm = fc_standardize(mat)
# summary(mat_norm)
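A quick sanity check that the standardization behaved: every column of mat_norm should have mean near 0 and standard deviation near 1.
# Both summaries should be tightly centered on 0 and 1 respectively
summary(colMeans(mat_norm))
summary(apply(mat_norm, 2, sd))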
make_model <- function(){
  model <- keras_model_sequential()
  model %>%
    layer_dense(units = 64, activation = 'relu', input_shape = c(ncol(mat))) %>%
    layer_dropout(rate = 0.35) %>%
    layer_dense(units = 32, activation = 'relu') %>%
    layer_dropout(rate = 0.35) %>%
    layer_dense(units = 1, activation = 'linear')
}
#summary(make_model())
oof_fc = matrix(0,nrow=nrow(wine),ncol=n_repeat)
i = 1
rep = 1
for(fold_ in cv_folds){
  wine.fc <- make_model()
  wine.fc %>% compile(
    loss = 'mse',
    # lr/decay were renamed (learning_rate etc.) in newer keras releases
    optimizer = optimizer_rmsprop(lr = 0.003, rho = 0.90, decay = 2e-4, clipvalue = 0.2)
  )
  history <- wine.fc %>% fit(
    mat_norm[fold_,], wine$quality[fold_],
    batch_size = 1024,
    epochs = 1000,
    verbose = 0,
    #class_weight = as.list(rep(1,7)/7),
    validation_data = list(mat_norm[-fold_,], wine$quality[-fold_]),
    callbacks = list(
      callback_early_stopping(patience = 100, verbose = 1, restore_best_weights = TRUE)
    )
  )
  print(history)
  #plot(history)
  oof_fc[-fold_,rep] = predict(wine.fc, mat_norm[-fold_,])
  i = i + 1
  if(i %% n_fold == 1){
    rep = rep + 1
  }
}
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=553)
## Final epoch (plot to see history):
## loss: 0.4678
## val_loss: 0.4594
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=663)
## Final epoch (plot to see history):
## loss: 0.4528
## val_loss: 0.4748
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=938)
## Final epoch (plot to see history):
## loss: 0.4502
## val_loss: 0.4515
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=633)
## Final epoch (plot to see history):
## loss: 0.4576
## val_loss: 0.4875
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=639)
## Final epoch (plot to see history):
## loss: 0.4603
## val_loss: 0.4548
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=1,000)
## Final epoch (plot to see history):
## loss: 0.4539
## val_loss: 0.4535
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=725)
## Final epoch (plot to see history):
## loss: 0.4569
## val_loss: 0.449
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=812)
## Final epoch (plot to see history):
## loss: 0.4473
## val_loss: 0.4728
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=767)
## Final epoch (plot to see history):
## loss: 0.4637
## val_loss: 0.4572
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=705)
## Final epoch (plot to see history):
## loss: 0.4576
## val_loss: 0.473
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=1,000)
## Final epoch (plot to see history):
## loss: 0.422
## val_loss: 0.4707
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=616)
## Final epoch (plot to see history):
## loss: 0.479
## val_loss: 0.4433
## Trained on 4,332 samples, validated on 2,165 samples (batch_size=1,024, epochs=736)
## Final epoch (plot to see history):
## loss: 0.4602
## val_loss: 0.4705
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=751)
## Final epoch (plot to see history):
## loss: 0.4595
## val_loss: 0.4686
## Trained on 4,331 samples, validated on 2,166 samples (batch_size=1,024, epochs=585)
## Final epoch (plot to see history):
## loss: 0.4714
## val_loss: 0.4552
pred.fc = rowMeans(oof_fc)
#summary(pred.fc)
boxplot(pred.fc~wine$quality, varwidth=TRUE, col="cornsilk",
        sub=Rsq(pred.fc,wine$quality))
# Collect every per-repeat OOF column into a single matrix for stacking
mod_names = ls(pattern="oof_")
mods = mget(mod_names)
for(m in names(mods)){
  colnames(mods[[m]]) = paste(m, 1:ncol(mods[[m]]), sep="_")
}
stack = do.call("cbind", mods)
summary(stack)
## oof_fc_1 oof_fc_2 oof_fc_3 oof_fc_4
## Min. :4.017 Min. :4.019 Min. :4.446 Min. :4.093
## 1st Qu.:5.381 1st Qu.:5.355 1st Qu.:5.378 1st Qu.:5.366
## Median :5.784 Median :5.790 Median :5.771 Median :5.776
## Mean :5.801 Mean :5.829 Mean :5.809 Mean :5.792
## 3rd Qu.:6.163 3rd Qu.:6.222 3rd Qu.:6.187 3rd Qu.:6.165
## Max. :7.651 Max. :7.710 Max. :7.629 Max. :7.722
## oof_fc_5 oof_lm_1 oof_lm_2 oof_lm_3
## Min. :4.198 Min. :3.286 Min. :3.245 Min. :3.592
## 1st Qu.:5.393 1st Qu.:5.481 1st Qu.:5.481 1st Qu.:5.480
## Median :5.790 Median :5.815 Median :5.812 Median :5.811
## Mean :5.815 Mean :5.819 Mean :5.818 Mean :5.818
## 3rd Qu.:6.177 3rd Qu.:6.166 3rd Qu.:6.169 3rd Qu.:6.169
## Max. :7.636 Max. :7.697 Max. :7.787 Max. :7.772
## oof_lm_4 oof_lm_5 oof_rpart_1 oof_rpart_2
## Min. :3.196 Min. :3.480 Min. :4.286 Min. :4.143
## 1st Qu.:5.481 1st Qu.:5.479 1st Qu.:5.342 1st Qu.:5.331
## Median :5.811 Median :5.815 Median :5.735 Median :5.764
## Mean :5.818 Mean :5.818 Mean :5.817 Mean :5.822
## 3rd Qu.:6.167 3rd Qu.:6.164 3rd Qu.:6.211 3rd Qu.:6.215
## Max. :7.704 Max. :7.685 Max. :7.636 Max. :7.562
## oof_rpart_3 oof_rpart_4 oof_rpart_5 oof_xgb_1
## Min. :4.000 Min. :3.857 Min. :3.875 Min. :3.713
## 1st Qu.:5.365 1st Qu.:5.387 1st Qu.:5.368 1st Qu.:5.320
## Median :5.827 Median :5.787 Median :5.763 Median :5.783
## Mean :5.819 Mean :5.818 Mean :5.817 Mean :5.825
## 3rd Qu.:6.188 3rd Qu.:6.118 3rd Qu.:6.216 3rd Qu.:6.274
## Max. :7.381 Max. :8.000 Max. :7.500 Max. :8.155
## oof_xgb_2 oof_xgb_3 oof_xgb_4 oof_xgb_5
## Min. :3.562 Min. :3.761 Min. :3.496 Min. :3.207
## 1st Qu.:5.332 1st Qu.:5.318 1st Qu.:5.310 1st Qu.:5.320
## Median :5.783 Median :5.776 Median :5.795 Median :5.798
## Mean :5.823 Mean :5.823 Mean :5.819 Mean :5.826
## 3rd Qu.:6.263 3rd Qu.:6.263 3rd Qu.:6.248 3rd Qu.:6.281
## Max. :8.072 Max. :8.080 Max. :8.037 Max. :8.225
heatmap(cor(stack),main="Base Model Correlations",symm=TRUE)
library(penalized)
## Loading required package: survival
## Welcome to penalized. For extended examples, see vignette("penalized").
# Non-negative least squares on the OOF columns: no unpenalized term (~0),
# coefficients constrained to be positive
stack.lm = penalized(wine$quality, stack, ~0, positive=TRUE, trace=FALSE)
barplot(coef(stack.lm),las=3,main="Stacking Coefficients")
pred.stack = predict(stack.lm,stack)[,1]
#summary(pred.stack)
boxplot(pred.stack~wine$quality, varwidth=TRUE,
        col="cornsilk", main="Stacking Prediction Distribution",
        sub=Rsq(pred.stack,wine$quality))
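Finally, the averaged OOF predictions side by side. The stacking weights were fit on all rows, so the stack's R-squared is mildly optimistic relative to the base models:
# OOF R-squared per model family, plus the stacked blend
sapply(list(rpart=pred_rp, lm=pred_lm, xgb=pred.xgb, fc=pred.fc, stack=pred.stack),
       function(p) Rsq(p, wine$quality))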