In this report we study the various factors that influence the quality of the red variant of the Portuguese [Vinho Verde](http://www.vinhoverde.pt/en/about-vinho-verde) wine. Quality is based on sensory scores (the median of at least 3 evaluations made by wine experts), with each expert grading the wine between 0 (very bad) and 10 (very excellent).
The dataset is available at <https://archive.ics.uci.edu/ml/datasets/Wine+Quality>.
We first read in the data from the raw text file included in the zip archive. The data is a delimited file where fields are separated by the `;` character.
df_red<-read.csv("./data/winequality-red.csv", sep = ';')
df_red <- na.omit(df_red) ## Omit NAs
After reading in the data we check its dimensions: there are 1,599 rows and 12 columns in this dataset.
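The chunk that produced the dimension output below is not shown in the source; a minimal equivalent:

```r
dim(df_red)  # number of rows and columns
```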
## [1] 1599 12
The H-spread (interquartile range) is a measure of statistical dispersion, equal to the difference between the 75th and 25th percentiles. We drop rows that fall more than three H-spreads outside that range:
library(dplyr)
# Drop extreme outliers: keep rows within 3 H-spreads of the 25th/75th percentiles
for (i in 1:11) {
  quantiles <- quantile(df_red[, i])
  iqr <- quantiles[[4]] - quantiles[[2]]  # 75th - 25th percentile
  inside_range <- c(quantiles[[2]] - 3 * iqr, quantiles[[4]] + 3 * iqr)
  df_red <- df_red[df_red[, i] >= inside_range[1] & df_red[, i] <= inside_range[2], ]
}
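As a sanity check, the H-spread used above matches R's built-in `IQR()`; a minimal sketch on the `pH` column:

```r
# IQR() computes the same 75th - 25th percentile spread
stopifnot(all.equal(IQR(df_red$pH),
                    unname(quantile(df_red$pH, 0.75) - quantile(df_red$pH, 0.25))))
```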
# 80/20 train/test split; sample the indices once so the two sets do not overlap
train_idx <- sample(nrow(df_red), floor(nrow(df_red) * 0.8))
wine_train <- df_red[train_idx, ]
wine_test <- df_red[-train_idx, ]
wine_train[, "quality_factors"] <- as.factor(wine_train$quality)
\[ \begin{align*} Cov(X,Y) &= E[(X-\bar X)(Y-\bar Y)] \\ &= \frac{1}{n-1}\sum_{i=1}^{n} (X_i-\bar X)(Y_i-\bar Y) \\ &= \frac{1}{n-1}((\sum_{i=1}^{n} X_iY_i)-n\bar X\bar Y) \end{align*} \]
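As a quick check of the shortcut in the last step, the manual sum agrees with R's built-in `cov()`; a sketch using two columns of the data:

```r
# Verify (sum(x*y) - n*mean(x)*mean(y)) / (n-1) == cov(x, y)
x <- df_red$fixed.acidity
y <- df_red$citric.acid
n <- length(x)
all.equal((sum(x * y) - n * mean(x) * mean(y)) / (n - 1), cov(x, y))
```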
The principal components are the eigenvectors of the covariance matrix, ordered by their eigenvalues in descending order.
Since we are working with matrices, to center the data we first convert the wine data frame to a matrix.
wine_mat<-as.matrix(wine_train[c(1:11)])
\[ \mathit{MeanMat} = \begin{bmatrix} 1 \\ 1 \\ \vdots \\ 1 \end{bmatrix} \begin{bmatrix} \bar x_1 & \bar x_2 & \cdots & \bar x_{11} \end{bmatrix} \]
n_obs <- nrow(wine_mat)
# Outer product of a column of ones and the column means gives a matrix of means
mean_mat <- matrix(1, nrow = n_obs, ncol = 1) %*% apply(wine_mat, 2, mean)
diff_mat <- wine_mat - mean_mat  # centered data
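The covariance matrix tabulated below follows from the centered data; a sketch of a chunk that would reproduce it (equivalent to `cov(wine_mat)`):

```r
# Sample covariance matrix from the centered difference matrix
cov_mat <- t(diff_mat) %*% diff_mat / (nrow(wine_mat) - 1)
knitr::kable(round(cov_mat, 3))
```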
| | fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol |
|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed.acidity | 2.902 | -0.085 | 0.227 | 0.212 | 0.006 | -2.201 | -5.965 | 0.002 | -0.189 | 0.047 | -0.176 |
| volatile.acidity | -0.085 | 0.033 | -0.020 | 0.008 | 0.001 | -0.074 | 0.289 | 0.000 | 0.007 | -0.008 | -0.033 |
| citric.acid | 0.227 | -0.020 | 0.037 | 0.021 | 0.000 | -0.122 | 0.069 | 0.000 | -0.016 | 0.008 | 0.019 |
| residual.sugar | 0.212 | 0.008 | 0.021 | 0.632 | 0.002 | 0.005 | 2.435 | 0.001 | -0.007 | 0.007 | 0.076 |
| chlorides | 0.006 | 0.001 | 0.000 | 0.002 | 0.001 | -0.025 | 0.019 | 0.000 | -0.001 | 0.000 | -0.006 |
| free.sulfur.dioxide | -2.201 | -0.074 | -0.122 | 0.005 | -0.025 | 86.754 | 167.527 | -0.001 | 0.149 | 0.072 | -0.094 |
| total.sulfur.dioxide | -5.965 | 0.289 | 0.069 | 2.435 | 0.019 | 167.527 | 783.574 | 0.003 | 0.055 | 0.098 | -5.465 |
| density | 0.002 | 0.000 | 0.000 | 0.001 | 0.000 | -0.001 | 0.003 | 0.000 | 0.000 | 0.000 | -0.001 |
| pH | -0.189 | 0.007 | -0.016 | -0.007 | -0.001 | 0.149 | 0.055 | 0.000 | 0.024 | -0.002 | 0.032 |
| sulphates | 0.047 | -0.008 | 0.008 | 0.007 | 0.000 | 0.072 | 0.098 | 0.000 | -0.002 | 0.020 | 0.027 |
| alcohol | -0.176 | -0.033 | 0.019 | 0.076 | -0.006 | -0.094 | -5.465 | -0.001 | 0.032 | 0.027 | 1.102 |
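Per the definition above, the principal components can be read off an eigen decomposition of this covariance matrix; a minimal sketch (`prcomp` below works on the standardized data instead):

```r
eig <- eigen(cov(wine_mat))  # eigenvalues are returned in decreasing order
eig$values                   # variance along each principal axis
eig$vectors[, 1]             # loadings of the first principal component
```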
We can re-use \(\mathit{DiffMat}\) since it is already centered. However, we still have to scale the data by dividing each column by its standard deviation.

1. Compute the standard deviation using this formula: \[ \begin{align*} StdDev(X) &= \sqrt{Variance(X)}\\ &= \sqrt{\sum_{i=1}^{n}\frac{\big(X_i - \bar X\big)^2}{n-1}} \end{align*} \]

2. The standardized data is then \[ ScaleCentered(X) = \frac{X-\mu}{\sigma} \]
sd_mat <- apply(wine_mat, 2, sd)                 # per-column standard deviations
sd_centered <- sweep(diff_mat, 2, sd_mat, "/")   # divide each centered column by its sd
wine.pr_manual <- prcomp(sd_centered)  # PCA on the manually standardized data
pcr_fromManual <- summary(wine.pr_manual)
wine.pr <- prcomp(wine_mat, scale. = TRUE, center = TRUE)  # built-in centering and scaling
pcr <- summary(wine.pr)
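The manual and built-in decompositions should agree up to the arbitrary sign of each eigenvector; a quick check:

```r
# Eigenvectors are only defined up to sign, so compare absolute loadings
all.equal(abs(wine.pr_manual$rotation), abs(wine.pr$rotation),
          check.attributes = FALSE)
```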
Create scree plots of the eigenvalues:
library(gridExtra)
library(ggplot2)
library(factoextra)
scree_plot<-fviz_eig(wine.pr)
pcr_df<-as.data.frame(pcr$x)
pcr_sd<-apply(pcr_df, 2, sd)
#colnames(pcr_sd)<-colnames(pcr_df)
pcr_sd <- sort(pcr_sd, decreasing=TRUE)
tt <- ttheme_default(core=list(fg_params=list(hjust=0, x=0.1)),
rowhead=list(fg_params=list(hjust=0, x=0)))
plot1<-qplot(y=pcr$sdev, xlab = "Principal Components", ylab = "St.Dev.EigenValues") + theme( axis.text.x = element_text(angle = 90))
prop.pca = wine.pr$sdev^2/sum(wine.pr$sdev^2)
percent_cumulative<-(cumsum(prop.pca))*100
plot2<- qplot(y=prop.pca, xlab = "Principal Components", ylab = "VAR.% Eigenvalues")+ theme( axis.text.x = element_text(angle = 90))
plot(percent_cumulative, xlab = "Principal Components", ylab = "Cumulative VAR% Eigenvalues")
abline(v = 7, col = "blue", lty = 5)
abline(h = percent_cumulative[7], col = "blue", lty = 5)
legend("topleft", legend = paste("Cut-off @ PC7 for %Var =", round(percent_cumulative[7], 2)),
       col = "blue", lty = 5, cex = 0.6)
grid.arrange(grobs = list(scree_plot, plot1, plot2), nrow=2)
## Principal component analysis
## Working with the principal components
library("factoextra")
fviz_pca_ind(wine.pr, geom.ind = "point", pointshape = 21,
pointsize = 2,
fill.ind = wine_train$quality_factors,
col.ind = "black",
palette = "jco",
addEllipses = TRUE,
label = "var",
col.var = "black",
repel = TRUE,
legend.title = "Quality") +
ggtitle("2D PCA-plot from 11 feature dataset") +
theme(plot.title = element_text(hjust = 0.5))
prop.pca = wine.pr$sdev^2/sum(wine.pr$sdev^2)
dataset.pca = data.frame(quality = wine_train[,"quality_factors"],
pca = wine.pr$x )
Generate the test set in the principal-component space by projecting it onto the PCA rotation learnt from the training data.
wine_test_mat <- as.matrix(wine_test[c(1:11)])
# predict() applies the training set's centering and scaling before rotating,
# which a raw wine_test_mat %*% wine.pr$rotation would not
pca_scores_test <- predict(wine.pr, newdata = wine_test_mat)
# Regress quality on the first 7 principal components
pcr_data <- data.frame(quality = wine_test$quality, pca_scores_test[, 1:7])
model.pca <- glm(quality ~ ., data = pcr_data)
predicted_quality <- predict(model.pca)
p1 <- ggplot() +
  geom_point(aes(x = wine_test$quality, y = predicted_quality, colour = wine_test$quality)) +
  labs(x = "Observed Quality", y = "Predicted Quality", title = "Principal Component Regression")
# Round predictions to the nearest integer quality score
fitted.quality <- round(predicted_quality)
misClassificError <- mean(fitted.quality != wine_test$quality)
# If quality is treated as numeric, the error can instead be measured as the
# numeric distance from the correct quality.
linear_reg <- lm(predicted_quality ~ wine_test$quality)
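Following that remark, a sketch of the numeric-distance version of the error:

```r
# Mean absolute distance between the rounded predictions and the true quality
mean(abs(fitted.quality - wine_test$quality))
```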
library(kableExtra)
tb_pca<-table(Predicted=fitted.quality ,Actual = wine_test$quality)
kable(tb_pca) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(1, bold = T, border_right = T) %>%
  add_header_above(c("Actual Quality" = 6))
| Predicted | 3 | 4 | 5 | 6 | 7 |
|---|---|---|---|---|---|
| 5 | 1 | 3 | 82 | 37 | 1 |
| 6 | 0 | 4 | 40 | 95 | 32 |
| 7 | 0 | 0 | 1 | 1 | 0 |
Misclassification Error = 0.4040404
Based on Bayes decision theory, we have the following given information:

* prior probabilities of wine quality
* conditional probabilities of each variable

Consider these prior probabilities:
wine_prior<-wine_train %>% group_by(quality_factors) %>% summarise(count = n(), prior_prob = round(n()/nrow(wine_train),3))
knitr::kable(wine_prior) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| quality_factors | count | prior_prob |
|---|---|---|
| 3 | 6 | 0.005 |
| 4 | 40 | 0.034 |
| 5 | 498 | 0.419 |
| 6 | 481 | 0.405 |
| 7 | 148 | 0.125 |
| 8 | 15 | 0.013 |
ggplot(wine_train, aes(x = pH)) +
  geom_density(aes(fill = quality_factors), alpha = 0.5)
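Combining these priors with class-conditional densities like the one plotted above, Bayes' rule yields posterior class probabilities. A minimal sketch for a single pH value, assuming Gaussian class-conditionals (an assumption of this illustration only):

```r
# P(quality | pH) is proportional to P(pH | quality) * P(quality)
posterior_at <- function(ph_value) {
  lik <- sapply(levels(wine_train$quality_factors), function(q) {
    x <- wine_train$pH[wine_train$quality_factors == q]
    dnorm(ph_value, mean = mean(x), sd = sd(x))  # Gaussian conditional (assumed)
  })
  post <- lik * wine_prior$prior_prob
  round(post / sum(post), 3)
}
posterior_at(3.3)  # hypothetical pH value
```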
While Principal Component Analysis maximises the variance, LDA maximises the inter-class separation. LDA is also a supervised learning technique, so the posterior probabilities are learnt from a training set.
library(MASS)
require(scales)
# Build the formula quality_factors ~ <all 11 predictors>
f <- paste(names(wine_train)[13], "~", paste(names(wine_train)[-c(12, 13)], collapse = " + "))
wine_train.lda <- lda(as.formula(f), data = wine_train)
wine_test.lda.predict <- predict(wine_train.lda, newdata = wine_test)
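`predict()` on an `lda` fit returns the predicted classes (`class`), the posterior probabilities (`posterior`), and the discriminant scores (`x`); all three are used below:

```r
str(wine_test.lda.predict, max.level = 1)
```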
The `svd` element of the fitted model gives the ratios of the between- to within-group standard deviations on the linear discriminant variables.
library(gridExtra)
wine_test[,"quality_factors"] = as.factor(wine_test$quality)
wine_train[,"quality_factors"] = as.factor(wine_train$quality)
dataset = data.frame(quality = wine_test[,"quality_factors"],
lda = wine_test.lda.predict$x )
prop.lda = wine_train.lda$svd^2/sum(wine_train.lda$svd^2)
p1 <- ggplot(dataset) + geom_point(aes(lda.LD1, lda.LD2, colour = quality, shape = quality), size = 2.5) +
labs(x = paste("LD1 (", percent(prop.lda[1]), ")", sep=""),
y = paste("LD2 (", percent(prop.lda[2]), ")", sep=""))
p2 <- ggplot(dataset.pca) + geom_point(aes(pca.PC1, pca.PC2, colour = quality, shape = quality), size = 2.5) +
labs(x = paste("PC1 (", percent(prop.pca[1]), ")", sep=""),
y = paste("PC2 (", percent(prop.pca[2]), ")", sep=""))
grid.arrange(p1, p2)
The LDA projection clearly separates the classes better, so LDA should be used for prediction. Let's calculate the posterior classification probabilities and the decision boundaries.
colorfun <- function(n,l=65,c=100) { hues = seq(15, 375, length=n+1); hcl(h=hues, l=l, c=c)[1:n] } # default ggplot2 colours
colors <- colorfun(length(levels(wine_test.lda.predict$class)))
colorslight <- colorfun(length(levels(wine_test.lda.predict$class)),l=90,c=50)
datPred <- data.frame(Quality=wine_test.lda.predict$class,wine_test.lda.predict$x)
#Create decision boundaries
fit2 <- lda(Quality ~ LD1 + LD2, data=datPred)
ld1lim <- expand_range(c(min(datPred$LD1),max(datPred$LD1)),mul=0.05)
ld2lim <- expand_range(c(min(datPred$LD2),max(datPred$LD2)),mul=0.05)
ld1 <- seq(ld1lim[[1]], ld1lim[[2]], length.out=300)
ld2 <- seq(ld2lim[[1]], ld2lim[[2]], length.out=300)
newdat <- expand.grid(list(LD1=ld1,LD2=ld2))
preds <-predict(fit2,newdata=newdat)
predclass <- preds$class
postprob <- preds$posterior
df <- data.frame(x=newdat$LD1, y=newdat$LD2, class=predclass)
df$classnum <- as.numeric(df$class)
datPred$maxProb <- apply(predict(wine_train.lda, newdata = wine_test)$posterior, 1, max)
ggplot(datPred, aes(x=LD1, y=LD2) ) +
geom_raster(data=df, aes(x=x, y=y, fill = factor(class)),alpha=0.7,show.legend =FALSE) +
geom_contour(data=df, aes(x=x, y=y, z=classnum), colour="red2", alpha=0.5, breaks=c(1.5,2.5)) +
geom_point(data = datPred, size = 3, aes(pch = Quality, colour=Quality, alpha = maxProb)) +
scale_x_continuous(limits = ld1lim, expand=c(0,0)) +
scale_y_continuous(limits = ld2lim, expand=c(0,0)) +
scale_fill_manual(values=colorslight, guide=F)
library(kableExtra)
tb<-table(Predicted=wine_test.lda.predict$class,Actual = wine_test$quality_factors)
kable(tb) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),full_width = F)%>%
column_spec(1, bold = T, border_right = T) %>%
add_header_above(c( "Actual Quality" = 6))
| Predicted | 3 | 4 | 5 | 6 | 7 |
|---|---|---|---|---|---|
| 3 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 1 | 3 | 0 |
| 5 | 0 | 6 | 59 | 57 | 15 |
| 6 | 1 | 1 | 48 | 63 | 15 |
| 7 | 0 | 0 | 15 | 10 | 3 |
| 8 | 0 | 0 | 0 | 0 | 0 |
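The accuracy quoted below is simply the proportion of correctly classified test wines, i.e. the diagonal of this confusion matrix over its total:

```r
# Overall accuracy; compare labels as characters since the factor levels differ
mean(as.character(wine_test.lda.predict$class) == as.character(wine_test$quality_factors))
```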
LDA Accuracy = 0.4208754