Importing packages and reading in CSVs

#Import packages
library(tidyverse)
library(car)
library(caret)
library(ROCR)
library(corrr)
library(cowplot)
library(arm)
library(stargazer)
# Read in the CSV and check data types.
# The original location is kept as the default so existing runs behave the
# same; set the WINE_CSV environment variable to point at a copy of
# WineQT.csv elsewhere without editing this script.
wine_path <- Sys.getenv(
  "WINE_CSV",
  unset = "C:/Users/qures/Downloads/WineQT.csv"
)
# Fail early with an actionable message instead of read.csv()'s generic
# "cannot open file" error.
if (!file.exists(wine_path)) {
  stop(
    "Wine data not found at '", wine_path,
    "'. Set the WINE_CSV environment variable to the location of WineQT.csv.",
    call. = FALSE
  )
}
wine <- read.csv(wine_path, header = TRUE)
str(wine)
# Remove the "Id" column so that it is not picked up as a predictor by the
# `.` shorthand in the model formula used later in this file.
wine <- subset(wine, select = -Id)

Plotting independent variables against the dependent variable

#Plots

# Quality
# Histogram of the dependent variable (number of wines per quality score).
# binwidth = 1 gives one bar per whole-number score (assumes quality takes
# integer values -- confirm with the str(wine) output). The default of 30 bins
# leaves empty gaps between scores and makes ggplot2 emit its
# "Pick better value with `binwidth`" message.
# NOTE: this object is only created here; it is not part of the plot_grid()
# call further down, so print it explicitly wherever it should be displayed.
qualityplot <- ggplot(data = wine, aes(x = quality)) +
  geom_histogram(binwidth = 1, color = "black", fill = "lightblue") +
  ggtitle("Quality") +
  xlab("Quality") +
  ylab("Number of Wines") +
  theme(plot.title = element_text(size = 10, face = "bold"))



# Scatter plots of each independent variable against quality.
# All eleven plots share the same layers, so they are built by one helper
# instead of eleven copy-pasted ggplot() chains.

# Build a scatter plot of `quality` (y axis) against one column (x axis).
#   x_var: name of a column in `data`, given as a string.
#   title: plot title.
#   data:  data frame containing `x_var` and `quality`; defaults to `wine`.
# Returns a ggplot object.
quality_scatter <- function(x_var, title, data = wine) {
  ggplot(data = data, aes(x = .data[[x_var]], y = quality)) +
    geom_point() +
    ggtitle(title) +
    # Set the axis labels explicitly so they read exactly as they do when the
    # column is named directly inside aes().
    labs(x = x_var, y = "quality") +
    theme(plot.title = element_text(size = 10, face = "bold"))
}

acidplot       <- quality_scatter("fixed.acidity", "Fixed acidity")
volacidplot    <- quality_scatter("volatile.acidity", "Volatile acidity")
citricacidplot <- quality_scatter("citric.acid", "Citric acid")
residplot      <- quality_scatter("residual.sugar", "Residual sugar")
chloridesplot  <- quality_scatter("chlorides", "Chlorides")
freesulfplot   <- quality_scatter("free.sulfur.dioxide", "Free SO2")
totalsulfplot  <- quality_scatter("total.sulfur.dioxide", "Total SO2")
totaldensplot  <- quality_scatter("density", "Density")
sulphatesplot  <- quality_scatter("sulphates", "Sulphates")
pHplot         <- quality_scatter("pH", "pH")
alcoholplot    <- quality_scatter("alcohol", "Alcohol")

# All plots combined into a single grid (same order as they were created).
plot_grid(
  acidplot, volacidplot, citricacidplot, residplot, chloridesplot,
  freesulfplot, totalsulfplot, totaldensplot, sulphatesplot, pHplot,
  alcoholplot
)

Plot of Dependent Variables

Variable Generation and Model Creation

# Binomial logistic regression

# Binary response derived from quality: 0 when quality is 5 or below,
# 1 when it is above 5. Stored as a factor.
wine$category <- as.factor(ifelse(wine$quality > 5, 1, 0))

# Fit the model: every remaining column as a main effect (the `.`), plus two
# interaction terms. `quality` is removed from the predictors because the
# response was derived from it.
model2 <- glm(
  category ~ . + total.sulfur.dioxide:sulphates + alcohol:residual.sugar - quality,
  data = wine,
  family = binomial(link = "logit")
)

Model summary and diagnostics

# Summary: plain-text regression table for the fitted model
# (coefficients with standard errors in parentheses and significance stars).
stargazer::stargazer(model2, type = "text")
## 
## ==========================================================
##                                    Dependent variable:    
##                                ---------------------------
##                                         category          
## ----------------------------------------------------------
## fixed.acidity                             0.099           
##                                          (0.121)          
##                                                           
## volatile.acidity                        -3.526***         
##                                          (0.600)          
##                                                           
## citric.acid                             -1.867***         
##                                          (0.684)          
##                                                           
## residual.sugar                            0.769           
##                                          (0.481)          
##                                                           
## chlorides                                -3.779*          
##                                          (1.932)          
##                                                           
## free.sulfur.dioxide                       0.010           
##                                          (0.010)          
##                                                           
## total.sulfur.dioxide                     0.015*           
##                                          (0.008)          
##                                                           
## density                                  19.880           
##                                         (98.241)          
##                                                           
## pH                                       -0.788           
##                                          (0.888)          
##                                                           
## sulphates                               5.874***          
##                                          (0.986)          
##                                                           
## alcohol                                 1.215***          
##                                          (0.195)          
##                                                           
## total.sulfur.dioxide:sulphates          -0.042***         
##                                          (0.011)          
##                                                           
## residual.sugar:alcohol                   -0.078           
##                                          (0.048)          
##                                                           
## Constant                                 -30.973          
##                                         (96.293)          
##                                                           
## ----------------------------------------------------------
## Observations                              1,143           
## Log Likelihood                          -574.075          
## Akaike Inf. Crit.                       1,176.150         
## ==========================================================
## Note:                          *p<0.1; **p<0.05; ***p<0.01

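We see that alcohol, citric acid, and sulphates contribute most to the overall quality of the wine. Exponentiating a coefficient gives an odds ratio, i.e. the factor by which the odds are multiplied. A unit increase in the alcohol content is expected to multiply the odds of a wine having high quality by about 3.37, a unit increase in the citric acid is expected to multiply the odds by about 0.155 (a decrease, since the coefficient is negative), and a unit increase in the sulphates is expected to multiply the odds by about 355.7. Because the model includes interaction terms, the alcohol and sulphates figures apply when the interacting variable (residual sugar and total sulfur dioxide, respectively) is zero. The interaction of total sulfur dioxide and sulphates has an odds ratio of about 0.959, so the positive effect of sulphates weakens as total sulfur dioxide increases.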

#Model diagnostic plots
# Draws the default set of diagnostic plots for the fitted glm object
# (the rendered report shows four figures from this call).
plot(model2)

plot of chunk unnamed-chunk-1plot of chunk unnamed-chunk-1plot of chunk unnamed-chunk-1plot of chunk unnamed-chunk-1

Model Accuracy and Variable Importance

# Binned residuals plot: average response-scale residual within bins of the
# fitted probabilities.
fitted_prob <- fitted(model2)
response_resid <- residuals(model2, type = "response")
binnedplot(fitted_prob, response_resid)

plot of chunk unnamed-chunk-1

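By the Normal Q-Q plot, the residuals show no serious departures apart from slightly fat tails and a few outliers. (Logistic regression does not assume normally distributed residuals, so this is an informal check rather than a test of a model assumption.) Additionally, using the binned residual plot, we can say that the spread of the residuals is roughly constant across the fitted values.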

# Model accuracy: ROC curve
# Predicted probabilities for the same rows the model was fitted on (no
# newdata is supplied). The predict() generic is used instead of calling the
# predict.glm method directly, so dispatch follows the class of model2.
predicted2 <- predict(model2, type = "response")
# ROCR objects: pair the predicted probabilities with the observed classes,
# then compute true-positive rate against false-positive rate.
pred2 <- prediction(predicted2, wine$category)
perf2 <- performance(pred2, "tpr", "fpr")
plot(perf2, colorize = TRUE, main = "ROC Curve")
# Diagonal reference line: the ROC curve of a classifier with no skill.
abline(a = 0, b = 1)

plot of chunk unnamed-chunk-1

# Classify at a 0.5 probability cut-off and compare with the observed classes.
# The levels are fixed explicitly so the prediction factor always carries both
# "0" and "1", even if every prediction falls on one side of the cut-off.
class_predictions2 <- factor(
  ifelse(predicted2 > 0.5, "1", "0"),
  levels = c("0", "1")
)
# NOTE: no `positive` argument is given, so the first level ("0", low quality)
# is treated as the positive class; Sensitivity and Pos Pred Value therefore
# describe the low-quality class. Pass positive = "1" to report them for
# high-quality wines instead.
confusionMatrix(class_predictions2, wine$category)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 393 140
##          1 129 481
##                                         
##                Accuracy : 0.7647        
##                  95% CI : (0.739, 0.789)
##     No Information Rate : 0.5433        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.5265        
##                                         
##  Mcnemar's Test P-Value : 0.5421        
##                                         
##             Sensitivity : 0.7529        
##             Specificity : 0.7746        
##          Pos Pred Value : 0.7373        
##          Neg Pred Value : 0.7885        
##              Prevalence : 0.4567        
##          Detection Rate : 0.3438        
##    Detection Prevalence : 0.4663        
##       Balanced Accuracy : 0.7637        
##                                         
##        'Positive' Class : 0             
## 
# Variable importance for each model term. For this model the "Overall" score
# matches the absolute value of coefficient / standard error from the
# regression table (e.g. alcohol: 1.215 / 0.195 is roughly 6.2).
caret::varImp(model2)
##                                  Overall
## fixed.acidity                  0.8198720
## volatile.acidity               5.8789761
## citric.acid                    2.7299811
## residual.sugar                 1.5997355
## chlorides                      1.9565793
## free.sulfur.dioxide            0.9453544
## total.sulfur.dioxide           1.9371083
## density                        0.2023616
## pH                             0.8867913
## sulphates                      5.9568550
## alcohol                        6.2212258
## total.sulfur.dioxide:sulphates 3.8078354
## residual.sugar:alcohol         1.6233714

Summary

Overall, we see that the most important variable in determining quality is the alcohol content. Sulphates also contribute significantly to the quality of the wine; however, there is a limit to the amount of sulphates that can be included in the wine. Over 50 ppm, the concentration of sulphates becomes noticeable and a distraction from the quality of the wine. So when an individual wants to find a high-quality wine, the most important factor would be the alcohol content. In theory, a wine with sulphates near 50 ppm and a high alcohol by volume would be of very high quality.