Study of Wine Quality

Synopsis

In this report we aim to study the various factors that influence the Quality of red variant of the Portuguese [Vinho Verde] (http://www.vinhoverde.pt/en/about-vinho-verde) wine. It is based on sensory scores (median of at least 3 evaluations made by wine experts). Each expert graded the wine quality between 0 (very bad) and 10 (very excellent).

Loading and Processing the Raw Data

The dataset is available at: (https://archive.ics.uci.edu/ml/datasets/Wine+Quality)

Reading in the data

We first read in the data from the raw text file included in the zip archive. The data is a delimited file were fields are delimited with the ; character.

df_red<-read.csv("./data/winequality-red.csv", sep = ';')
df_red <- na.omit(df_red) ## Omit NAs

After reading in the data we check the first few rows (there are 1,599) rows in this dataset. $x^2$

## [1] 1599   12

Remove outliers using Inter-Quartile distance (IQR)

H-spread, is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles

library(dplyr)
for (i in 1:11) {
  quantiles<-quantile(df_red[,i])
  iqr<-quantiles[[3]] - quantiles[[1]]
  inside_range = c(quantiles[[1]]-3*iqr, quantiles[[3]]+3*iqr)
  df_red<-df_red %>% filter(df_red[,i]>=inside_range[1] & df_red[,i]<=inside_range[2])
}

Split data into 80-20 Training and Test set ratios

row_num<-floor(nrow(df_red)*.8)
wine_train<-df_red[sample(nrow(df_red), row_num),]
wine_train[,"quality_factors"] = as.factor(wine_train$quality)

row_num.test<-floor(nrow(df_red)*.2)
wine_test<-df_red[sample(nrow(df_red), row_num.test),]

Study of Correlation and Covariances

\[ \begin{align*} Cov(X,Y) &= E[(X-\bar X)(Y-\bar Y)] \\ &= \frac{1}{n-1}\sum_{i=1}^{n} (X_i-\bar X)(Y_i-\bar Y) \\ &= \frac{1}{n-1}((\sum_{i=1}^{n} X_iY_i)-n\bar X\bar Y) \end{align*} \]

Principal Components

Principal Components are the eigenvalues in the descending order of the eigenvectors of the covariance matrix.

Programming Details

We are working with matrices To center the data, first convert the wine dataframe to a matrix.

wine_mat<-as.matrix(wine_train[c(1:11)])
  1. create a matrix of means of rows and columns : 1188, 11 respectively

\[ \begin{align*} Mean Mat &= \begin{bmatrix} 1 \\ 1 \\ \vdots \\ 1 \\ \end{bmatrix} \times [\bar x_1 \bar x_2 ...\bar x_{11} ] \end{align*} \]

nrow <- nrow(wine_mat)
ncol <- ncol(wine_mat)
mean_mat = matrix(1,nrow = nrow, ncol = 1) %*% apply(wine_mat, 2, mean)
  1. Create a Difference matrix $DiffMat of the values and the Mean \[ \begin{align*} DiffMat &= WineMat-MeanMat \end{align*} \]
diff_mat = wine_mat - mean_mat
  1. Create a covariance matrix \(Cov\) by multiplying the transposed difference matrix \(DiffMat^T\) with \(DiffMat\) and dividing by \(n-1\), the degrees of freedom \[ \begin{align*} Covar &= \frac {DiffMat \times DiffMat^T} {(n-1)} \end{align*} \]
    fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
    fixed.acidity 2.902 -0.085 0.227 0.212 0.006 -2.201 -5.965 0.002 -0.189 0.047 -0.176
    volatile.acidity -0.085 0.033 -0.020 0.008 0.001 -0.074 0.289 0.000 0.007 -0.008 -0.033
    citric.acid 0.227 -0.020 0.037 0.021 0.000 -0.122 0.069 0.000 -0.016 0.008 0.019
    residual.sugar 0.212 0.008 0.021 0.632 0.002 0.005 2.435 0.001 -0.007 0.007 0.076
    chlorides 0.006 0.001 0.000 0.002 0.001 -0.025 0.019 0.000 -0.001 0.000 -0.006
    free.sulfur.dioxide -2.201 -0.074 -0.122 0.005 -0.025 86.754 167.527 -0.001 0.149 0.072 -0.094
    total.sulfur.dioxide -5.965 0.289 0.069 2.435 0.019 167.527 783.574 0.003 0.055 0.098 -5.465
    density 0.002 0.000 0.000 0.001 0.000 -0.001 0.003 0.000 0.000 0.000 -0.001
    pH -0.189 0.007 -0.016 -0.007 -0.001 0.149 0.055 0.000 0.024 -0.002 0.032
    sulphates 0.047 -0.008 0.008 0.007 0.000 0.072 0.098 0.000 -0.002 0.020 0.027
    alcohol -0.176 -0.033 0.019 0.076 -0.006 -0.094 -5.465 -0.001 0.032 0.027 1.102

Scale the data so that the mean is zero and that the standard deviation is one

We can re-use the \(DiffMat\) as that is centered data. However, we have to scale the data by dividing by the standard deviations 1. Compute Standard Deviation using this formula \[ \begin{align*} StdDev(X) &= \sqrt{Variance(X)}\\ &=\sqrt { \sum_{i=1}^{11}\frac {\big( X_{i}-\bar X\big)^2} {(n-1)}} \end{align*} \] Standardized data is, \[ \begin{align*} ScaleCentered(X) &= \frac{X-\mu}\sigma \end{align*} \]

sd_mat = apply(wine_mat, 2, sd)
sd_centered <- matrix(nrow= nrow(diff_mat), ncol = ncol(diff_mat) )
sd_center <- function(x,y){
  col_len = ncol(x)
 
  for(i in 1:col_len){
    sd_centered[,i] <<- x[,i]/y[i]
  }

}
sd_center(diff_mat,sd_mat)
wine.pr_manual <- prcomp(sd_centered)
pcr_fromManual<-summary(wine.pr_manual)
wine.pr <- prcomp(wine_mat, scale. = TRUE,center = TRUE)
pcr<-summary(wine.pr)

Scree Plot

Create a line plot of eigenvalues

library(gridExtra)
library(ggplot2)
library(factoextra)
scree_plot<-fviz_eig(wine.pr)

pcr_df<-as.data.frame(pcr$x)
pcr_sd<-apply(pcr_df, 2, sd)
#colnames(pcr_sd)<-colnames(pcr_df)
pcr_sd <- sort(pcr_sd, decreasing=TRUE)

tt <- ttheme_default(core=list(fg_params=list(hjust=0, x=0.1)),
                      rowhead=list(fg_params=list(hjust=0, x=0)))
plot1<-qplot(y=pcr$sdev, xlab = "Principal Components", ylab = "St.Dev.EigenValues") + theme( axis.text.x = element_text(angle = 90))

prop.pca = wine.pr$sdev^2/sum(wine.pr$sdev^2)
percent_cumulative<-(cumsum(prop.pca))*100


plot2<- qplot(y=prop.pca, xlab = "Principal Components", ylab = "VAR.% Eigenvalues")+ theme( axis.text.x = element_text(angle = 90))

plot3 <- plot(percent_cumulative,xlab = "Principal Components", ylab = "Cumulative VAR% Eigenvalues") + theme( axis.text.x = element_text(angle = 90)) 
abline(v = 7, col="blue", lty=5) 
abline(h = percent_cumulative[7], col="blue", lty=5) 
legend("topleft", legend=paste("Cut-off @ PC7 for %Var = ",round(percent_cumulative[7],2)),
       col=c("blue"), lty=5, cex=0.6)

#t <- tableGrob(data_moments_table[,i], theme = tt)
#par(mfrow = c(2, 2))
grid.arrange(grobs = list(scree_plot, plot1, plot2), nrow=2)

Model Fitting using Principal Component Regression

## Principal component analysis

## Working with the principal components
library("factoextra")
fviz_pca_ind(wine.pr, geom.ind = "point", pointshape = 21, 
             pointsize = 2, 
             fill.ind = wine_train$quality_factors, 
             col.ind = "black", 
             palette = "jco", 
             addEllipses = TRUE,
             label = "var",
             col.var = "black",
             repel = TRUE,
             legend.title = "Quality") +
  ggtitle("2D PCA-plot from 11 feature dataset") +
  theme(plot.title = element_text(hjust = 0.5))

prop.pca = wine.pr$sdev^2/sum(wine.pr$sdev^2)


dataset.pca = data.frame(quality = wine_train[,"quality_factors"],
                     pca = wine.pr$x )

Generate the test-set in the Principal Component space Project the test-set onto the original PCA

#wine_mat %*% wine.pr$rotation == wine.pr$x # should be TRUE
wine_test_mat<-as.matrix(wine_test[c(1:11)])

#head(predict(wine.pr, newdata = wine_test_mat) == wine_test_mat %*% wine.pr$rotation)
pca_scores_test =wine_test_mat %*% wine.pr$rotation

form <- "wine_test[,12] ~ pca_scores_test[,1]"
for (i in 2:7){
 form <- paste(form,"+ pca_scores_test[,",i,"]")
}
model.pca=glm(as.formula(form))

predicted_quality<- predict(model.pca)
p1 <- ggplot() + geom_point(aes(y = predicted_quality, x = wine_test$quality,fill=wine_test$quality), xlab = "Observed Quality", ylab = "Predicted Quality", main = "Principal Component Regression")

#find optimal cut-off
fitted.quality<- round(predicted_quality)

misClasificError <- mean(fitted.quality != wine_test$quality)
###If quality were numeric then the misclassification error is the numeric distance from the correct quality.
linear_reg<-lm(predicted_quality~wine_test$quality)

Confusion Matrix from PCA Regression

library(kableExtra)
tb_pca<-table(Predicted=fitted.quality ,Actual = wine_test$quality)
kable(tb_pca) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),full_width = F)%>%
  column_spec(1, bold = T, border_right = T) %>%
  add_header_above(c( "Actual Quality" = 6)) %>% kable_styling()
Actual Quality
3 4 5 6 7
5 1 3 82 37 1
6 0 4 40 95 32
7 0 0 1 1 0

Misclassification Error = 0.4040404

======

LDA (Linear Discriminant Analysis)

Deep discussion on LDA

Based on Bayes Decision Theory * Given Information + prior probabilities of Wine Quality + Conditional Probabilities of Each Variable Consider these prior probabilities

 wine_prior<-wine_train %>% group_by(quality_factors) %>% summarise(count = n(), prior_prob = round(n()/nrow(wine_train),3))
knitr::kable(wine_prior) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", full_width = F))
quality_factors count prior_prob
3 6 0.005
4 40 0.034
5 498 0.419
6 481 0.405
7 148 0.125
8 15 0.013
  • conditional probabilities of each variable
ggplot(wine_train, aes(x = pH)) +
  geom_density(aes(fill = quality_factors, alpha=0.5))

While the Principal Component Analysis maximises the variances, the LDA maximises the inter-class separation. LDA is also a supervised learning technique. So there’s a training set in which the posterior probabilities are learnt.

library(MASS)
require(scales)
f <- paste(names(wine_train)[13], "~", paste(names(wine_train)[-c(12,13)], collapse=" + "))
wine_train.lda <- lda(as.formula(paste(f)), data = wine_train)

wine_test.lda.predict <- predict(wine_train.lda, newdata = wine_test,interval = "prediction")

The SVD gives the ratio of the between- and within-group standard deviations on the linear discriminant variables.

library(gridExtra)
wine_test[,"quality_factors"] = as.factor(wine_test$quality)
wine_train[,"quality_factors"] = as.factor(wine_train$quality)
dataset = data.frame(quality = wine_test[,"quality_factors"],
                    lda = wine_test.lda.predict$x )
prop.lda = wine_train.lda$svd^2/sum(wine_train.lda$svd^2)

p1 <- ggplot(dataset) + geom_point(aes(lda.LD1, lda.LD2, colour = quality, shape = quality), size = 2.5) + 
  labs(x = paste("LD1 (", percent(prop.lda[1]), ")", sep=""),
       y = paste("LD2 (", percent(prop.lda[2]), ")", sep=""))

p2 <- ggplot(dataset.pca) + geom_point(aes(pca.PC1, pca.PC2, colour = quality, shape = quality), size = 2.5) +
  labs(x = paste("PC1 (", percent(prop.pca[1]), ")", sep=""),
       y = paste("PC2 (", percent(prop.pca[2]), ")", sep=""))

grid.arrange(p1, p2)

Clearly the LDA technique should be used for better predictions. Let’s calculate the posterior classification probabilities and the decision boundary

colorfun <- function(n,l=65,c=100) { hues = seq(15, 375, length=n+1); hcl(h=hues, l=l, c=c)[1:n] } # default ggplot2 colours
colors <- colorfun(length(levels(wine_test.lda.predict$class)))
colorslight <- colorfun(length(levels(wine_test.lda.predict$class)),l=90,c=50)

datPred <- data.frame(Quality=wine_test.lda.predict$class,wine_test.lda.predict$x)

#Create decision boundaries
fit2 <- lda(Quality ~ LD1 + LD2, data=datPred)
ld1lim <- expand_range(c(min(datPred$LD1),max(datPred$LD1)),mul=0.05)
ld2lim <- expand_range(c(min(datPred$LD2),max(datPred$LD2)),mul=0.05)
ld1 <- seq(ld1lim[[1]], ld1lim[[2]], length.out=300)
ld2 <- seq(ld2lim[[1]], ld1lim[[2]], length.out=300)
newdat <- expand.grid(list(LD1=ld1,LD2=ld2))

preds <-predict(fit2,newdata=newdat)
predclass <- preds$class
postprob <- preds$posterior
df <- data.frame(x=newdat$LD1, y=newdat$LD2, class=predclass)
df$classnum <- as.numeric(df$class)

datPred$maxProb <- apply(predict(wine_train.lda, newdata = wine_test)$posterior, 1, max)
ggplot(datPred, aes(x=LD1, y=LD2) ) +
  geom_raster(data=df, aes(x=x, y=y, fill = factor(class)),alpha=0.7,show.legend =FALSE) +
  geom_contour(data=df, aes(x=x, y=y, z=classnum), colour="red2", alpha=0.5, breaks=c(1.5,2.5)) +
  geom_point(data = datPred, size = 3, aes(pch = Quality,  colour=Quality, alpha = maxProb)) +
  scale_x_continuous(limits = ld1lim, expand=c(0,0)) +
  scale_y_continuous(limits = ld2lim, expand=c(0,0)) +
  scale_fill_manual(values=colorslight, guide=F)

Confusion Matrix from LDA

library(kableExtra)
tb<-table(Predicted=wine_test.lda.predict$class,Actual = wine_test$quality_factors)
kable(tb) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),full_width = F)%>%
  column_spec(1, bold = T, border_right = T) %>%
  add_header_above(c( "Actual Quality" = 6))
Actual Quality
3 4 5 6 7
3 0 0 0 0 0
4 0 0 1 3 0
5 0 6 59 57 15
6 1 1 48 63 15
7 0 0 15 10 3
8 0 0 0 0 0

LDA Accuracy = 0.4208754

======