knitr::opts_chunk$set(echo = TRUE)
We are interested in understanding whether wine taste testers are actually judging the quality of wine based on its chemical properties.
In addition, we want to judge exclusively what makes a high quality wine, which means we are interested in a classification problem.
We know that alcohol content and residual sugar can be identified by the drinker more than any other chemical.
We hypothesis: 1) That high alcohol content can be classified as high quality wine, red or white, with a 95% confidence level. 2) That residual sugar can be classified as high quality wine among white wine with a 95% confidence level.
Beyond our hypotheses, we will look into the differences between red and white wine.
Previous research has found that the chemical properties of wine are not highly associated with the quality of wine (cite). However, we are looking at only classifying high quality wine as a dichotomous variable, and not as a linear variable, which may help us discover different findings.
When gathering the data, we combined two different datasets, one with white wine data and one with red wine data. We created a dichotomous variable that identifies if the other variables are applying to red or white wine. Wine quality, a variable that can range from 0 to 10, but ranges from 3 to 9 in our dataset. All values that are seven or higher we consider “high quality” wine, which takes a value of one while the all values below seven take a value of 0.
Our data had no missing data. We found, using box plots, that alcohol is a good variable for classification as it has little variation in high quality wine and little variation in low quality wine, but lots of variables between high quality wine and low quality wine. Sugar does not show the same strong distinction.
###########
#libraries
###########
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(e1071)
library(gridExtra)
library(ggplot2)
library(GGally)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(corrplot)
## corrplot 0.84 loaded
library(outliers)
library(magicfor)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(rpart.plot)
## Loading required package: rpart
library(rpart)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(RColorBrewer)
###############################################
#importing data and checking for missing value
###############################################
setwd("C:\\Users\\Siebelm\\Documents\\3 GWU\\1 DATS 6101\\Project\\Final")
wine_white_df0 <- read.csv("winequality-white.csv", sep=",")
wine_red_df0 <- read.csv("winequality-red.csv", sep=",")
sum(is.na(wine_white_df0))
## [1] 0
sum(is.na(wine_red_df0))
## [1] 0
#No missing value found; good.
##############################
#summary of the wine_white_df0 data
##############################
str(wine_white_df0)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
summary(wine_white_df0)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
head(wine_white_df0)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
tail(wine_white_df0)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 4893 6.5 0.23 0.38 1.3 0.032
## 4894 6.2 0.21 0.29 1.6 0.039
## 4895 6.6 0.32 0.36 8.0 0.047
## 4896 6.5 0.24 0.19 1.2 0.041
## 4897 5.5 0.29 0.30 1.1 0.022
## 4898 6.0 0.21 0.38 0.8 0.020
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 4893 29 112 0.99298 3.29 0.54
## 4894 24 92 0.99114 3.27 0.50
## 4895 57 168 0.99490 3.15 0.46
## 4896 30 111 0.99254 2.99 0.46
## 4897 20 110 0.98869 3.34 0.38
## 4898 22 98 0.98941 3.26 0.32
## alcohol quality
## 4893 9.7 5
## 4894 11.2 6
## 4895 9.6 5
## 4896 9.4 6
## 4897 12.8 7
## 4898 11.8 6
names(wine_white_df0)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
dim(wine_white_df0)
## [1] 4898 12
View(wine_white_df0)
#############################
#summary of the wine_red_df0 data
############################
str(wine_red_df0)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
summary(wine_red_df0)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median :0.07900 Median :14.00 Median : 38.00
## Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
head(wine_red_df0)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
tail(wine_red_df0)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1594 6.8 0.620 0.08 1.9 0.068
## 1595 6.2 0.600 0.08 2.0 0.090
## 1596 5.9 0.550 0.10 2.2 0.062
## 1597 6.3 0.510 0.13 2.3 0.076
## 1598 5.9 0.645 0.12 2.0 0.075
## 1599 6.0 0.310 0.47 3.6 0.067
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 1594 28 38 0.99651 3.42 0.82
## 1595 32 44 0.99490 3.45 0.58
## 1596 39 51 0.99512 3.52 0.76
## 1597 29 40 0.99574 3.42 0.75
## 1598 32 44 0.99547 3.57 0.71
## 1599 18 42 0.99549 3.39 0.66
## alcohol quality
## 1594 9.5 6
## 1595 10.5 5
## 1596 11.2 6
## 1597 11.0 6
## 1598 10.2 5
## 1599 11.0 6
names(wine_red_df0)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
dim(wine_red_df0)
## [1] 1599 12
View(wine_red_df0)
#White wine has 4898 samples and 12 variables where as red-wine has 1599 samples and 12 variables. For both the data sets the response variable is quality. The eleven predicting variables are of numeric class and response variable is of the integer class.
table(wine_white_df0$quality)
##
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
table(wine_red_df0$quality)
##
## 3 4 5 6 7 8
## 10 53 681 638 199 18
#From the tables found, it is clear that there is very big class imbalance. For white-wine out of 4898 samples, only 20 are of the class 3 and only 5 are of the class 9. There are not enough samples of those classes to split the data into useable training and test sets.
#lets define the quality as binomail value "good" and "bad"; as if quality value is equal to and greater than 7 the wine is "good" and if quality value is smaller than 7 the wine is "bad".
quality.factor <- factor(wine_white_df0$quality)
quality.cat <- NA
quality.cat <- ifelse(wine_white_df0$quality>=7, 1, NA)
quality.cat <- ifelse(wine_white_df0$quality<=6, 0, quality.cat)
quality.cat <- factor(quality.cat, levels = c("0", "1"))
######################################################
#Draw boxplots for different variable of white-wine
######################################################
for (i in c(1:11)){
box = qplot(x=quality.cat, y = wine_white_df0[,i], data=wine_white_df0, geom="boxplot", ylab=names(wine_white_df0)[i], xlab= "Quality Catagory" )
grid.arrange(box, ncol=2)
}
###################################################
#Draw boxplots for different variables of red-wine
###################################################
quality.factor1 <- factor(wine_red_df0$quality)
quality.cat1 <- NA
quality.cat1 <- ifelse(wine_red_df0$quality>=7, 1, NA)
quality.cat1 <- ifelse(wine_red_df0$quality<=6, 0, quality.cat1)
quality.cat1 <- factor(quality.cat1, levels = c("0", "1"))
for (i in c(1:11)){
box1 = qplot(x=quality.cat1, y = wine_red_df0[,i], data=wine_red_df0, geom="boxplot", ylab=names(wine_red_df0)[i], xlab= "Quality Catagory" )
grid.arrange(box1, ncol=2)
}
#################################################################
#Finding the correlation between different variables in white-wine
#################################################################
ready <-cor(wine_white_df0)
corrplot(ready, type="upper", method="number", tl.srt=45, title="Correlation between different variables in wine_white_df0 ")
#lets have a look at how correlated our different variables are with the quality (simplified than corplot)
cor(x=wine_white_df0[,1:11], y=wine_white_df0$quality)
## [,1]
## fixed.acidity -0.113662831
## volatile.acidity -0.194722969
## citric.acid -0.009209091
## residual.sugar -0.097576829
## chlorides -0.209934411
## free.sulfur.dioxide 0.008158067
## total.sulfur.dioxide -0.174737218
## density -0.307123313
## pH 0.099427246
## sulphates 0.053677877
## alcohol 0.435574715
# "alcohol", "density", and "Chlorides" seems to be most influencing contents for the white-wine quality
################################################################
#Finding the correlation between different variables in red-wine
################################################################
ready1 <-cor(wine_red_df0)
corrplot(ready1, type="upper", method="number", tl.srt=45, title="Correlation between different variables in wine_red_df0 ")
#lets have a look at how correlated our different variables are with the quality (simplified than corplot)
cor(x=wine_red_df0[,1:11], y=wine_red_df0$quality)
## [,1]
## fixed.acidity 0.12405165
## volatile.acidity -0.39055778
## citric.acid 0.22637251
## residual.sugar 0.01373164
## chlorides -0.12890656
## free.sulfur.dioxide -0.05065606
## total.sulfur.dioxide -0.18510029
## density -0.17491923
## pH -0.05773139
## sulphates 0.25139708
## alcohol 0.47616632
# "alcohol", "volatile.acidity", "Sulphates", and "Citric.acid" seems to be most influencing contents for the wine_red_df0 quality
Our EDA process shows that alcohol may be a good predictor, but sugar might not. We also find that density has high correlations with other variables so we will check for multicollinearity in any modeling we do and likely drop it.
We tried KNN models and Decision Trees, but the model fit was not necessarily better than a logistic regression, the latter of which would give us better illustrative power.
# Since alcohol and residual sugar both have high correlation with the density, lets delete density to alleviate the multicollinearity.
wine_white_df0$density <- NULL
wine_red_df0$density <- NULL
#Dividing the wine_white_df0 data into training and test sets
#training set for white-wine
white_train <- wine_white_df0[1:3750, ]
#test set for white-wine
white_test <- wine_white_df0[3751:4898, ]
#Dividing the wine_red_df0 data into training and test sets
#train set for wine_red_df0
red_train <- wine_red_df0[1:1200, ]
#test set for wine_red_df0
red_test <- wine_red_df0[1201:1599, ]
#############################
# White-wine KNN model
############################
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
set.seed(2)
whiteKnn <- train(quality ~., data = white_train, method = "knn", trControl = fitControl)
whiteKnn
## k-Nearest Neighbors
##
## 3750 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 3374, 3375, 3375, 3375, 3376, 3375, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.8338721 0.2031074 0.6471356
## 7 0.8354868 0.1878524 0.6600925
## 9 0.8328163 0.1854829 0.6605090
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
#For wine_white_df0: Highest R-squared is ~ 0.2 with k=5, along with RMSE of 0.8361450 and MAE of 0.6483786
# Test KNN model on white wine test data
whitew_knnPredictions <- predict(whiteKnn, newdata = white_train)
r2_whitew_knn <- R2(whitew_knnPredictions, white_train$quality)
r2_whitew_knn
## [1] 0.3420206
rmse_whitew_knn <- RMSE(whitew_knnPredictions, white_train$quality)
rmse_whitew_knn
## [1] 0.7428622
###############################
#Red-wine KNN model
###############################
set.seed(2)
redKnn <- train(quality ~., data = red_train, method = "knn", trControl = fitControl)
redKnn
## k-Nearest Neighbors
##
## 1200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 1079, 1080, 1080, 1079, 1080, 1079, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.7221064 0.2306235 0.5486295
## 7 0.7167156 0.2303043 0.5544908
## 9 0.7185641 0.2204522 0.5595933
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
#The highest R-squared is ~0.23 with k=5, along with RMSE of 0.7221064 and MAE of 0.5486295
# Test KNN model on white wine test data
redw_knnPredictions <- predict(redKnn, newdata = red_train)
r2_redw_knn <- R2(redw_knnPredictions, red_train$quality)
r2_redw_knn
## [1] 0.4300096
rmse_redw_knn <- RMSE(redw_knnPredictions, red_train$quality)
rmse_redw_knn
## [1] 0.6128408
####################################
# Decision tree model for wine_white_df0
####################################
m.rpart <-rpart(quality~., data=white_train)
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 3140.06000 5.886933
## 2) alcohol< 10.85 2473 1510.66200 5.609381
## 4) volatile.acidity>=0.2425 1406 740.15080 5.402560
## 8) volatile.acidity>=0.4225 182 92.99451 4.994505 *
## 9) volatile.acidity< 0.4225 1224 612.34560 5.463235 *
## 5) volatile.acidity< 0.2425 1067 631.12090 5.881912 *
## 3) alcohol>=10.85 1277 1069.95800 6.424432
## 6) free.sulfur.dioxide< 11.5 93 99.18280 5.473118 *
## 7) free.sulfur.dioxide>=11.5 1184 879.99920 6.499155
## 14) alcohol< 11.85 611 447.38130 6.296236 *
## 15) alcohol>=11.85 573 380.63180 6.715532 *
#Summary
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = white_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.17816211 0 1.0000000 1.0001439 0.02387516
## 2 0.04439109 1 0.8218379 0.8267870 0.02249675
## 3 0.02890893 2 0.7774468 0.7908978 0.02226125
## 4 0.01655575 3 0.7485379 0.7620384 0.02113785
## 5 0.01108600 4 0.7319821 0.7491583 0.02067740
## 6 0.01000000 5 0.7208961 0.7461695 0.02060968
##
## Variable importance
## alcohol chlorides volatile.acidity
## 49 16 15
## total.sulfur.dioxide free.sulfur.dioxide sulphates
## 8 8 2
## pH fixed.acidity residual.sugar
## 1 1 1
##
## Node number 1: 3750 observations, complexity param=0.1781621
## mean=5.886933, MSE=0.8373493
## left son=2 (2473 obs) right son=3 (1277 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.17816210, (0 missing)
## chlorides < 0.0395 to the right, improve=0.08199995, (0 missing)
## total.sulfur.dioxide < 153.5 to the right, improve=0.03875440, (0 missing)
## free.sulfur.dioxide < 11.75 to the left, improve=0.03632119, (0 missing)
## citric.acid < 0.235 to the left, improve=0.02871882, (0 missing)
## Surrogate splits:
## chlorides < 0.0375 to the right, agree=0.773, adj=0.334, (0 split)
## total.sulfur.dioxide < 102.5 to the right, agree=0.705, adj=0.132, (0 split)
## sulphates < 0.345 to the right, agree=0.670, adj=0.031, (0 split)
## fixed.acidity < 5.25 to the right, agree=0.662, adj=0.009, (0 split)
## free.sulfur.dioxide < 4.5 to the right, agree=0.662, adj=0.008, (0 split)
##
## Node number 2: 2473 observations, complexity param=0.04439109
## mean=5.609381, MSE=0.6108623
## left son=4 (1406 obs) right son=5 (1067 obs)
## Primary splits:
## volatile.acidity < 0.2425 to the right, improve=0.09227123, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.04177240, (0 missing)
## alcohol < 10.15 to the left, improve=0.03313802, (0 missing)
## citric.acid < 0.205 to the left, improve=0.02721200, (0 missing)
## pH < 3.325 to the left, improve=0.01860335, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 111.5 to the right, agree=0.610, adj=0.097, (0 split)
## pH < 3.295 to the left, agree=0.598, adj=0.067, (0 split)
## alcohol < 10.05 to the left, agree=0.590, adj=0.049, (0 split)
## sulphates < 0.715 to the left, agree=0.584, adj=0.037, (0 split)
## residual.sugar < 1.85 to the right, agree=0.581, adj=0.029, (0 split)
##
## Node number 3: 1277 observations, complexity param=0.02890893
## mean=6.424432, MSE=0.8378682
## left son=6 (93 obs) right son=7 (1184 obs)
## Primary splits:
## free.sulfur.dioxide < 11.5 to the left, improve=0.08484051, (0 missing)
## alcohol < 11.85 to the left, improve=0.06149941, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.04259695, (0 missing)
## residual.sugar < 1.275 to the left, improve=0.02795662, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.02541719, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 48.5 to the left, agree=0.937, adj=0.14, (0 split)
##
## Node number 4: 1406 observations, complexity param=0.011086
## mean=5.40256, MSE=0.526423
## left son=8 (182 obs) right son=9 (1224 obs)
## Primary splits:
## volatile.acidity < 0.4225 to the right, improve=0.04703189, (0 missing)
## free.sulfur.dioxide < 17.5 to the left, improve=0.04607770, (0 missing)
## total.sulfur.dioxide < 86.5 to the left, improve=0.02894310, (0 missing)
## alcohol < 10.25 to the left, improve=0.02890077, (0 missing)
## chlorides < 0.0455 to the right, improve=0.02096635, (0 missing)
## Surrogate splits:
## citric.acid < 0.11 to the left, agree=0.873, adj=0.022, (0 split)
## fixed.acidity < 9.85 to the right, agree=0.873, adj=0.016, (0 split)
## chlorides < 0.206 to the right, agree=0.871, adj=0.005, (0 split)
##
## Node number 5: 1067 observations
## mean=5.881912, MSE=0.591491
##
## Node number 6: 93 observations
## mean=5.473118, MSE=1.066482
##
## Node number 7: 1184 observations, complexity param=0.01655575
## mean=6.499155, MSE=0.7432425
## left son=14 (611 obs) right son=15 (573 obs)
## Primary splits:
## alcohol < 11.85 to the left, improve=0.05907511, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.04400660, (0 missing)
## residual.sugar < 1.225 to the left, improve=0.02503936, (0 missing)
## pH < 3.245 to the left, improve=0.02417936, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02307876, (0 missing)
## Surrogate splits:
## volatile.acidity < 0.2675 to the left, agree=0.665, adj=0.307, (0 split)
## chlorides < 0.0365 to the right, agree=0.631, adj=0.237, (0 split)
## total.sulfur.dioxide < 126.5 to the right, agree=0.566, adj=0.103, (0 split)
## residual.sugar < 1.525 to the left, agree=0.560, adj=0.091, (0 split)
## fixed.acidity < 6.25 to the right, agree=0.548, adj=0.066, (0 split)
##
## Node number 8: 182 observations
## mean=4.994505, MSE=0.5109588
##
## Node number 9: 1224 observations
## mean=5.463235, MSE=0.5002823
##
## Node number 14: 611 observations
## mean=6.296236, MSE=0.7322117
##
## Node number 15: 573 observations
## mean=6.715532, MSE=0.6642788
# Visualizing the decision tree
rpart.plot(m.rpart, digits = 3)
#lets see R-square for our medel
rsq.rpart(m.rpart)
##
## Regression tree:
## rpart(formula = quality ~ ., data = white_train)
##
## Variables actually used in tree construction:
## [1] alcohol free.sulfur.dioxide volatile.acidity
##
## Root node error: 3140.1/3750 = 0.83735
##
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.178162 0 1.00000 1.00014 0.023875
## 2 0.044391 1 0.82184 0.82679 0.022497
## 3 0.028909 2 0.77745 0.79090 0.022261
## 4 0.016556 3 0.74854 0.76204 0.021138
## 5 0.011086 4 0.73198 0.74916 0.020677
## 6 0.010000 5 0.72090 0.74617 0.020610
#R-square measure is ~0.2
# Evaluating the model performance
p.rpart <- predict(m.rpart, white_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.995 5.463 5.882 5.999 6.296 6.716
summary(white_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.848 6.000 8.000
# From these summaries, we can say that our model is not good for estimating really bad and really good wine.
#lets measure the performance with the mean absolute error
MAE <- function(actual, predicted) {
mean(abs(actual - predicted))
}
# MAE for our predictions
MAE(p.rpart, white_test$quality)
## [1] 0.5732104
# MAE of 0.57.
#######################################
#Lets use Decision tree for red-wine
#######################################
# Decision tree
m.rpart1 <-rpart(quality~., data=red_train)
m.rpart1
## n= 1200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1200 785.330000 5.665000
## 2) alcohol< 11.45 987 512.729500 5.495441
## 4) volatile.acidity>=0.3625 823 363.798300 5.390036
## 8) sulphates< 0.575 309 105.229800 5.158576 *
## 9) sulphates>=0.575 514 232.062300 5.529183
## 18) total.sulfur.dioxide>=50.5 192 56.369790 5.276042 *
## 19) total.sulfur.dioxide< 50.5 322 156.052800 5.680124
## 38) residual.sugar< 5.325 306 138.457500 5.640523
## 76) alcohol< 10.45 209 84.047850 5.531100 *
## 77) alcohol>=10.45 97 46.515460 5.876289 *
## 39) residual.sugar>=5.325 16 7.937500 6.437500 *
## 5) volatile.acidity< 0.3625 164 93.902440 6.024390
## 10) alcohol< 9.75 53 17.132080 5.547170 *
## 11) alcohol>=9.75 111 58.936940 6.252252
## 22) fixed.acidity< 7.45 12 4.916667 5.416667 *
## 23) fixed.acidity>=7.45 99 44.626260 6.353535 *
## 3) alcohol>=11.45 213 112.732400 6.450704
## 6) sulphates< 0.685 115 51.791300 6.191304 *
## 7) sulphates>=0.685 98 44.122450 6.755102 *
#Summary
summary(m.rpart1)
## Call:
## rpart(formula = quality ~ ., data = red_train)
## n= 1200
##
## CP nsplit rel error xerror xstd
## 1 0.20356808 0 1.0000000 1.0016219 0.04109923
## 2 0.07007086 1 0.7964319 0.8410013 0.03868634
## 3 0.03375176 2 0.7263611 0.8116537 0.03974462
## 4 0.02500818 3 0.6926093 0.7800651 0.03836102
## 5 0.02270819 4 0.6676011 0.7619690 0.03756087
## 6 0.02141602 5 0.6448929 0.7603761 0.03764648
## 7 0.01229773 6 0.6234769 0.7266420 0.03654322
## 8 0.01196186 7 0.6111792 0.7159890 0.03634829
## 9 0.01005209 8 0.5992173 0.7163799 0.03646044
## 10 0.01000000 9 0.5891652 0.7100659 0.03686808
##
## Variable importance
## alcohol volatile.acidity sulphates
## 47 15 12
## fixed.acidity total.sulfur.dioxide chlorides
## 7 6 4
## residual.sugar free.sulfur.dioxide pH
## 3 3 2
## citric.acid
## 2
##
## Node number 1: 1200 observations, complexity param=0.2035681
## mean=5.665, MSE=0.6544417
## left son=2 (987 obs) right son=3 (213 obs)
## Primary splits:
## alcohol < 11.45 to the left, improve=0.20356810, (0 missing)
## volatile.acidity < 0.3625 to the right, improve=0.11984750, (0 missing)
## sulphates < 0.645 to the left, improve=0.11076200, (0 missing)
## citric.acid < 0.305 to the left, improve=0.07121436, (0 missing)
## total.sulfur.dioxide < 50.5 to the right, improve=0.05185390, (0 missing)
## Surrogate splits:
## chlorides < 0.0525 to the right, agree=0.838, adj=0.085, (0 split)
## fixed.acidity < 5.65 to the right, agree=0.836, adj=0.075, (0 split)
## pH < 3.7 to the left, agree=0.829, adj=0.038, (0 split)
## volatile.acidity < 0.15 to the right, agree=0.825, adj=0.014, (0 split)
## residual.sugar < 1.25 to the right, agree=0.825, adj=0.014, (0 split)
##
## Node number 2: 987 observations, complexity param=0.07007086
## mean=5.495441, MSE=0.5194828
## left son=4 (823 obs) right son=5 (164 obs)
## Primary splits:
## volatile.acidity < 0.3625 to the right, improve=0.10732510, (0 missing)
## alcohol < 9.85 to the left, improve=0.09959044, (0 missing)
## sulphates < 0.575 to the left, improve=0.09928169, (0 missing)
## total.sulfur.dioxide < 83.5 to the right, improve=0.05461431, (0 missing)
## fixed.acidity < 9.95 to the left, improve=0.05141429, (0 missing)
## Surrogate splits:
## free.sulfur.dioxide < 1.5 to the right, agree=0.835, adj=0.006, (0 split)
##
## Node number 3: 213 observations, complexity param=0.02141602
## mean=6.450704, MSE=0.5292601
## left son=6 (115 obs) right son=7 (98 obs)
## Primary splits:
## sulphates < 0.685 to the left, improve=0.14919080, (0 missing)
## citric.acid < 0.315 to the left, improve=0.07322301, (0 missing)
## fixed.acidity < 6.55 to the left, improve=0.07275680, (0 missing)
## pH < 3.385 to the right, improve=0.06663666, (0 missing)
## volatile.acidity < 0.425 to the right, improve=0.05271640, (0 missing)
## Surrogate splits:
## fixed.acidity < 8.1 to the left, agree=0.657, adj=0.255, (0 split)
## citric.acid < 0.305 to the left, agree=0.606, adj=0.143, (0 split)
## total.sulfur.dioxide < 14.5 to the left, agree=0.582, adj=0.092, (0 split)
## free.sulfur.dioxide < 9.5 to the left, agree=0.577, adj=0.082, (0 split)
## alcohol < 13.25 to the left, agree=0.573, adj=0.071, (0 split)
##
## Node number 4: 823 observations, complexity param=0.03375176
## mean=5.390036, MSE=0.4420392
## left son=8 (309 obs) right son=9 (514 obs)
## Primary splits:
## sulphates < 0.575 to the left, improve=0.07285979, (0 missing)
## alcohol < 9.85 to the left, improve=0.07095052, (0 missing)
## volatile.acidity < 0.8425 to the right, improve=0.04513344, (0 missing)
## total.sulfur.dioxide < 64.5 to the right, improve=0.04466255, (0 missing)
## fixed.acidity < 9.95 to the left, improve=0.01318280, (0 missing)
## Surrogate splits:
## volatile.acidity < 0.7175 to the right, agree=0.661, adj=0.097, (0 split)
## chlorides < 0.0545 to the left, agree=0.633, adj=0.023, (0 split)
## total.sulfur.dioxide < 14.5 to the left, agree=0.631, adj=0.016, (0 split)
## fixed.acidity < 6.35 to the left, agree=0.629, adj=0.013, (0 split)
## citric.acid < 0.105 to the left, agree=0.628, adj=0.010, (0 split)
##
## Node number 5: 164 observations, complexity param=0.02270819
## mean=6.02439, MSE=0.5725758
## left son=10 (53 obs) right son=11 (111 obs)
## Primary splits:
## alcohol < 9.75 to the left, improve=0.1899144, (0 missing)
## sulphates < 0.635 to the left, improve=0.1814382, (0 missing)
## total.sulfur.dioxide < 47.5 to the right, improve=0.1326863, (0 missing)
## fixed.acidity < 7.45 to the left, improve=0.1224836, (0 missing)
## citric.acid < 0.305 to the left, improve=0.1061851, (0 missing)
## Surrogate splits:
## sulphates < 0.505 to the left, agree=0.726, adj=0.151, (0 split)
## total.sulfur.dioxide < 79.5 to the right, agree=0.707, adj=0.094, (0 split)
## fixed.acidity < 13.15 to the right, agree=0.701, adj=0.075, (0 split)
## pH < 2.96 to the left, agree=0.695, adj=0.057, (0 split)
## chlorides < 0.1045 to the right, agree=0.689, adj=0.038, (0 split)
##
## Node number 6: 115 observations
## mean=6.191304, MSE=0.4503592
##
## Node number 7: 98 observations
## mean=6.755102, MSE=0.4502291
##
## Node number 8: 309 observations
## mean=5.158576, MSE=0.3405494
##
## Node number 9: 514 observations, complexity param=0.02500818
## mean=5.529183, MSE=0.451483
## left son=18 (192 obs) right son=19 (322 obs)
## Primary splits:
## total.sulfur.dioxide < 50.5 to the right, improve=0.08463104, (0 missing)
## alcohol < 9.85 to the left, improve=0.08226617, (0 missing)
## volatile.acidity < 0.605 to the right, improve=0.03986347, (0 missing)
## chlorides < 0.0975 to the right, improve=0.03245228, (0 missing)
## free.sulfur.dioxide < 33.5 to the right, improve=0.02284737, (0 missing)
## Surrogate splits:
## free.sulfur.dioxide < 19.5 to the right, agree=0.786, adj=0.427, (0 split)
## sulphates < 0.97 to the right, agree=0.652, adj=0.068, (0 split)
## residual.sugar < 2.75 to the right, agree=0.648, adj=0.057, (0 split)
## volatile.acidity < 0.7225 to the right, agree=0.642, adj=0.042, (0 split)
## chlorides < 0.0975 to the right, agree=0.638, adj=0.031, (0 split)
##
## Node number 10: 53 observations
## mean=5.54717, MSE=0.3232467
##
## Node number 11: 111 observations, complexity param=0.01196186
## mean=6.252252, MSE=0.5309634
## left son=22 (12 obs) right son=23 (99 obs)
## Primary splits:
## fixed.acidity < 7.45 to the left, improve=0.15939080, (0 missing)
## pH < 3.265 to the right, improve=0.13081510, (0 missing)
## sulphates < 0.635 to the left, improve=0.12960490, (0 missing)
## citric.acid < 0.335 to the left, improve=0.12550520, (0 missing)
## total.sulfur.dioxide < 47.5 to the right, improve=0.08427302, (0 missing)
## Surrogate splits:
## citric.acid < 0.295 to the left, agree=0.928, adj=0.333, (0 split)
##
## Node number 18: 192 observations
## mean=5.276042, MSE=0.2935927
##
## Node number 19: 322 observations, complexity param=0.01229773
## mean=5.680124, MSE=0.484636
## left son=38 (306 obs) right son=39 (16 obs)
## Primary splits:
## residual.sugar < 5.325 to the left, improve=0.06188789, (0 missing)
## alcohol < 9.85 to the left, improve=0.06139566, (0 missing)
## sulphates < 0.675 to the left, improve=0.04352325, (0 missing)
## free.sulfur.dioxide < 5.25 to the left, improve=0.02921466, (0 missing)
## volatile.acidity < 0.605 to the right, improve=0.02690070, (0 missing)
##
## Node number 22: 12 observations
## mean=5.416667, MSE=0.4097222
##
## Node number 23: 99 observations
## mean=6.353535, MSE=0.4507703
##
## Node number 38: 306 observations, complexity param=0.01005209
## mean=5.640523, MSE=0.4524755
## left son=76 (209 obs) right son=77 (97 obs)
## Primary splits:
## alcohol < 10.45 to the left, improve=0.05701536, (0 missing)
## volatile.acidity < 0.605 to the right, improve=0.03199175, (0 missing)
## sulphates < 0.675 to the left, improve=0.03147389, (0 missing)
## free.sulfur.dioxide < 5.25 to the left, improve=0.02845508, (0 missing)
## citric.acid < 0.505 to the left, improve=0.02237595, (0 missing)
## Surrogate splits:
## fixed.acidity < 13.85 to the left, agree=0.699, adj=0.052, (0 split)
## residual.sugar < 3.65 to the left, agree=0.699, adj=0.052, (0 split)
## pH < 2.97 to the right, agree=0.699, adj=0.052, (0 split)
## citric.acid < 0.685 to the left, agree=0.696, adj=0.041, (0 split)
## chlorides < 0.056 to the right, agree=0.696, adj=0.041, (0 split)
##
## Node number 39: 16 observations
## mean=6.4375, MSE=0.4960938
##
## Node number 76: 209 observations
## mean=5.5311, MSE=0.4021428
##
## Node number 77: 97 observations
## mean=5.876289, MSE=0.4795409
# Visualizing the decision tree
rpart.plot(m.rpart1, digits = 3)
#lets see R-square for our medel
rsq.rpart(m.rpart1)
##
## Regression tree:
## rpart(formula = quality ~ ., data = red_train)
##
## Variables actually used in tree construction:
## [1] alcohol fixed.acidity residual.sugar
## [4] sulphates total.sulfur.dioxide volatile.acidity
##
## Root node error: 785.33/1200 = 0.65444
##
## n= 1200
##
## CP nsplit rel error xerror xstd
## 1 0.203568 0 1.00000 1.00162 0.041099
## 2 0.070071 1 0.79643 0.84100 0.038686
## 3 0.033752 2 0.72636 0.81165 0.039745
## 4 0.025008 3 0.69261 0.78007 0.038361
## 5 0.022708 4 0.66760 0.76197 0.037561
## 6 0.021416 5 0.64489 0.76038 0.037646
## 7 0.012298 6 0.62348 0.72664 0.036543
## 8 0.011962 7 0.61118 0.71599 0.036348
## 9 0.010052 8 0.59922 0.71638 0.036460
## 10 0.010000 9 0.58917 0.71007 0.036868
#R-square measure is ~0.2
# Evaluating the model performance
p.rpart1 <- predict(m.rpart1, red_test)
summary(p.rpart1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.159 5.159 5.531 5.620 5.876 6.755
summary(red_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.549 6.000 8.000
# From these summaries, we can say that our model is not good for estimating really bad and really good wine.
#lets measure the performance with the mean absolute error
MAE <- function(actual, predicted) {
mean(abs(actual - predicted))
}
# MAE for our predictions
MAE(p.rpart1, red_test$quality)
## [1] 0.5647755
# Mean absolute error of 0.56.
# Conclusion: Decision tree doesn't give us a good model for white and red wines.
First, we confirmed from our EDA that density has a variance inflation factor of over 10, and that by removing it, all other variables lower their variance inflation factor to below 10, eliminating multicollinearity. we run models separately for white and red and conclude that the dataset are correct to combine into one dataset, but that there still may be significant differences between red and white wines. Therefore, we take interaction terms of three variables (citric acid, sulphates, and total SO2) and later show that their addition improves our model’s diagnositics.
# Red
wine_red_df1 <- read.csv("winequality-red.csv", sep=",")
wine_red_df1$quality.dum[wine_red_df1$quality < 7] <- 0
wine_red_df1$quality.dum[wine_red_df1$quality >= 7] <- 1
wine_red_df1$quality <- NULL
wine_red_df1_std <- data.frame(wine_red_df1)
for(col in names(wine_red_df1)){
wine_red_df1_std[,col] <- (wine_red_df1[,col]-min(wine_red_df1[,col]))/(max(wine_red_df1[,col])-min(wine_red_df1[,col]))
}
model_red_std1 <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_red_df1_std)
summary(model_red_std1)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_red_df1_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9878 -0.4351 -0.2207 -0.1222 2.9869
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.3189 0.8339 -3.980 6.89e-05 ***
## fixed.acidity 3.1070 1.4156 2.195 0.028183 *
## volatile.acidity -3.7683 1.1451 -3.291 0.000999 ***
## citric.acid 0.5678 0.8385 0.677 0.498313
## residual.sugar 3.4962 1.0765 3.248 0.001163 **
## chlorides -5.2810 2.0155 -2.620 0.008788 **
## free.sulfur.dioxide 0.7683 0.8687 0.884 0.376469
## total.sulfur.dioxide -4.6782 1.3850 -3.378 0.000731 ***
## density -3.5112 1.5036 -2.335 0.019536 *
## pH 0.2847 1.2679 0.225 0.822327
## sulphates 6.2623 0.9044 6.924 4.39e-12 ***
## alcohol 4.8967 0.8555 5.724 1.04e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1269.92 on 1598 degrees of freedom
## Residual deviance: 870.86 on 1587 degrees of freedom
## AIC: 894.86
##
## Number of Fisher Scoring iterations: 6
# White
wine_white_df1 <- read.csv("winequality-white.csv", sep=",")
wine_white_df1$quality.dum[wine_white_df1$quality < 7] <- 0
wine_white_df1$quality.dum[wine_white_df1$quality >= 7] <- 1
wine_white_df1$quality <- NULL
wine_white_df1_std <- data.frame(wine_white_df1)
for(col in names(wine_white_df1)){
wine_white_df1_std[,col] <- (wine_white_df1[,col]-min(wine_white_df1[,col]))/(max(wine_white_df1[,col])-min(wine_white_df1[,col]))
}
wine_white_std1 <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_white_df1_std)
summary(wine_white_std1)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_white_df1_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1436 -0.6725 -0.4114 -0.1798 2.8331
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8412 0.4622 -3.984 6.79e-05 ***
## fixed.acidity 5.7419 0.9415 6.099 1.07e-09 ***
## volatile.acidity -3.8606 0.4982 -7.749 9.28e-15 ***
## citric.acid -1.2247 0.6656 -1.840 0.065776 .
## residual.sugar 19.2467 2.3235 8.283 < 2e-16 ***
## chlorides -4.2597 1.2861 -3.312 0.000926 ***
## free.sulfur.dioxide 2.4810 0.8984 2.762 0.005749 **
## total.sulfur.dioxide -0.1162 0.6491 -0.179 0.857936
## density -34.1863 4.9482 -6.909 4.89e-12 ***
## pH 3.6773 0.4695 7.832 4.81e-15 ***
## sulphates 1.8643 0.2988 6.238 4.42e-10 ***
## alcohol 0.8825 0.7061 1.250 0.211334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5116.8 on 4897 degrees of freedom
## Residual deviance: 4143.2 on 4886 degrees of freedom
## AIC: 4167.2
##
## Number of Fisher Scoring iterations: 6
### Standardized Model ###
wine_df1 <- read.csv("Combined_Wine_Data_NumType.csv", sep=",")
wine_df1$quality.dum[wine_df1$quality < 7] <- 0
wine_df1$quality.dum[wine_df1$quality >= 7] <- 1
head(wine_df1$quality.dum, 20)
## [1] 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0
wine_df1$quality <- NULL
summary(wine_df1)
## type fixed.acidity volatile.acidity citric.acid
## Min. :0.0000 Min. : 3.800 Min. :0.0800 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500
## Median :0.0000 Median : 7.000 Median :0.2900 Median :0.3100
## Mean :0.2461 Mean : 7.215 Mean :0.3397 Mean :0.3186
## 3rd Qu.:0.0000 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900
## Max. :1.0000 Max. :15.900 Max. :1.5800 Max. :1.6600
## residual.sugar chlorides free.sulfur.dioxide
## Min. : 0.600 Min. :0.00900 Min. : 1.00
## 1st Qu.: 1.800 1st Qu.:0.03800 1st Qu.: 17.00
## Median : 3.000 Median :0.04700 Median : 29.00
## Mean : 5.443 Mean :0.05603 Mean : 30.53
## 3rd Qu.: 8.100 3rd Qu.:0.06500 3rd Qu.: 41.00
## Max. :65.800 Max. :0.61100 Max. :289.00
## total.sulfur.dioxide density pH sulphates
## Min. : 6.0 Min. :0.9871 Min. :2.720 Min. :0.2200
## 1st Qu.: 77.0 1st Qu.:0.9923 1st Qu.:3.110 1st Qu.:0.4300
## Median :118.0 Median :0.9949 Median :3.210 Median :0.5100
## Mean :115.7 Mean :0.9947 Mean :3.219 Mean :0.5313
## 3rd Qu.:156.0 3rd Qu.:0.9970 3rd Qu.:3.320 3rd Qu.:0.6000
## Max. :440.0 Max. :1.0390 Max. :4.010 Max. :2.0000
## alcohol quality.dum
## Min. : 8.00 Min. :0.0000
## 1st Qu.: 9.50 1st Qu.:0.0000
## Median :10.30 Median :0.0000
## Mean :10.49 Mean :0.1966
## 3rd Qu.:11.30 3rd Qu.:0.0000
## Max. :14.90 Max. :1.0000
wine_df1_std <- data.frame(wine_df1)
for(col in names(wine_df1)){
wine_df1_std[,col] <- (wine_df1[,col]-min(wine_df1[,col]))/(max(wine_df1[,col])-min(wine_df1[,col]))
}
model_std1 <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_df1_std)
summary(model_std1)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_df1_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8136 -0.6291 -0.3686 -0.1759 3.0534
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.3535 0.3515 -9.540 < 2e-16 ***
## type 0.7747 0.2446 3.167 0.001538 **
## fixed.acidity 5.9778 0.8120 7.362 1.82e-13 ***
## volatile.acidity -5.4946 0.5823 -9.437 < 2e-16 ***
## citric.acid -0.4130 0.5740 -0.719 0.471841
## residual.sugar 14.3284 1.7124 8.367 < 2e-16 ***
## chlorides -4.5822 1.5035 -3.048 0.002306 **
## free.sulfur.dioxide 3.1100 0.8505 3.657 0.000255 ***
## total.sulfur.dioxide -1.6080 0.5793 -2.776 0.005509 **
## density -21.9816 3.4611 -6.351 2.14e-10 ***
## pH 3.3494 0.4662 7.185 6.73e-13 ***
## sulphates 4.3763 0.5080 8.615 < 2e-16 ***
## alcohol 3.1333 0.5577 5.618 1.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5078.8 on 6484 degrees of freedom
## AIC: 5104.8
##
## Number of Fisher Scoring iterations: 6
library(car)
round(vif(model_std1),2)
## type fixed.acidity volatile.acidity
## 6.99 6.28 1.75
## citric.acid residual.sugar chlorides
## 1.52 10.87 2.18
## free.sulfur.dioxide total.sulfur.dioxide density
## 2.13 3.67 31.06
## pH sulphates alcohol
## 2.70 1.61 7.04
### Multicollinearity is an issue
### Redo based on removing for Multicollinearity ###
# Red
wine_red_df2 <- read.csv("winequality-red.csv", sep=",")
wine_red_df2$quality.dum[wine_red_df2$quality < 7] <- 0
wine_red_df2$quality.dum[wine_red_df2$quality >= 7] <- 1
wine_red_df2$quality <- NULL
wine_red_df2_std <- data.frame(wine_red_df2)
for(col in names(wine_red_df2)){
wine_red_df2_std[,col] <- (wine_red_df2[,col]-min(wine_red_df2[,col]))/(max(wine_red_df2[,col])-min(wine_red_df2[,col]))
}
summary(wine_red_df2_std)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.00000
## 1st Qu.:0.2212 1st Qu.:0.1849 1st Qu.:0.090 1st Qu.:0.06849
## Median :0.2920 Median :0.2740 Median :0.260 Median :0.08904
## Mean :0.3292 Mean :0.2793 Mean :0.271 Mean :0.11225
## 3rd Qu.:0.4071 3rd Qu.:0.3562 3rd Qu.:0.420 3rd Qu.:0.11644
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :1.00000
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.09683 1st Qu.:0.08451 1st Qu.:0.05654
## Median :0.11185 Median :0.18310 Median :0.11307
## Mean :0.12599 Mean :0.20951 Mean :0.14300
## 3rd Qu.:0.13022 3rd Qu.:0.28169 3rd Qu.:0.19788
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## density pH sulphates alcohol
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.4060 1st Qu.:0.3701 1st Qu.:0.1317 1st Qu.:0.1692
## Median :0.4905 Median :0.4488 Median :0.1737 Median :0.2769
## Mean :0.4902 Mean :0.4497 Mean :0.1965 Mean :0.3112
## 3rd Qu.:0.5701 3rd Qu.:0.5197 3rd Qu.:0.2395 3rd Qu.:0.4154
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## quality.dum
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1357
## 3rd Qu.:0.0000
## Max. :1.0000
wine_red_df2_std$density <- NULL
model_red_std2 <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_red_df2_std)
summary(model_red_std2)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_red_df2_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7543 -0.4347 -0.2251 -0.1204 3.0677
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.2384 0.8348 -3.879 0.000105 ***
## fixed.acidity 0.5274 0.8911 0.592 0.553948
## volatile.acidity -4.5489 1.1185 -4.067 4.76e-05 ***
## citric.acid 0.2618 0.8287 0.316 0.752059
## residual.sugar 2.0245 0.9055 2.236 0.025369 *
## chlorides -6.0781 2.1395 -2.841 0.004499 **
## free.sulfur.dioxide 0.9659 0.8780 1.100 0.271275
## total.sulfur.dioxide -4.9921 1.4332 -3.483 0.000495 ***
## pH -1.3483 1.0710 -1.259 0.208087
## sulphates 5.7111 0.8776 6.508 7.63e-11 ***
## alcohol 6.3677 0.5898 10.796 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1269.92 on 1598 degrees of freedom
## Residual deviance: 876.34 on 1588 degrees of freedom
## AIC: 898.34
##
## Number of Fisher Scoring iterations: 6
# White
wine_white_df2 <- read.csv("winequality-white.csv", sep=",")
wine_white_df2$quality.dum[wine_white_df2$quality < 7] <- 0
wine_white_df2$quality.dum[wine_white_df2$quality >= 7] <- 1
wine_white_df2$quality <- NULL
wine_white_df2_std <- data.frame(wine_white_df2)
for(col in names(wine_white_df2)){
wine_white_df2_std[,col] <- (wine_white_df2[,col]-min(wine_white_df2[,col]))/(max(wine_white_df2[,col])-min(wine_white_df2[,col]))
}
summary(wine_white_df2_std)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2404 1st Qu.:0.1275 1st Qu.:0.1627 1st Qu.:0.01687
## Median :0.2885 Median :0.1765 Median :0.1928 Median :0.07055
## Mean :0.2937 Mean :0.1944 Mean :0.2013 Mean :0.08883
## 3rd Qu.:0.3365 3rd Qu.:0.2353 3rd Qu.:0.2349 3rd Qu.:0.14264
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.08012 1st Qu.:0.07317 1st Qu.:0.2297
## Median :0.10089 Median :0.11150 Median :0.2900
## Mean :0.10912 Mean :0.11606 Mean :0.3001
## 3rd Qu.:0.12166 3rd Qu.:0.15331 3rd Qu.:0.3666
## Max. :1.00000 Max. :1.00000 Max. :1.0000
## density pH sulphates alcohol
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.08892 1st Qu.:0.3364 1st Qu.:0.2209 1st Qu.:0.2419
## Median :0.12782 Median :0.4182 Median :0.2907 Median :0.3871
## Mean :0.13336 Mean :0.4257 Mean :0.3138 Mean :0.4055
## 3rd Qu.:0.17332 3rd Qu.:0.5091 3rd Qu.:0.3837 3rd Qu.:0.5484
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## quality.dum
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2164
## 3rd Qu.:0.0000
## Max. :1.0000
wine_white_df2_std$density <- NULL
wine_white_std2 <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_white_df2_std)
summary(wine_white_std2)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_white_df2_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9830 -0.6676 -0.4285 -0.1864 3.0170
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5868 0.3739 -9.592 < 2e-16 ***
## fixed.acidity 0.6635 0.5753 1.153 0.2488
## volatile.acidity -4.0164 0.4915 -8.171 3.05e-16 ***
## citric.acid -1.4734 0.6598 -2.233 0.0255 *
## residual.sugar 3.7341 0.6397 5.837 5.31e-09 ***
## chlorides -6.0874 1.3082 -4.653 3.26e-06 ***
## free.sulfur.dioxide 3.6729 0.8712 4.216 2.49e-05 ***
## total.sulfur.dioxide -1.3977 0.6149 -2.273 0.0230 *
## pH 1.3476 0.3219 4.187 2.83e-05 ***
## sulphates 1.0954 0.2753 3.979 6.92e-05 ***
## alcohol 5.4160 0.2760 19.621 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5116.8 on 4897 degrees of freedom
## Residual deviance: 4196.5 on 4887 degrees of freedom
## AIC: 4218.5
##
## Number of Fisher Scoring iterations: 6
### Standardized Model ###
wine_df2 <- read.csv("Combined_Wine_Data_NumType.csv", sep=",")
wine_df2$quality.dum[wine_df2$quality < 7] <- 0
wine_df2$quality.dum[wine_df2$quality >= 7] <- 1
head(wine_df2$quality.dum, 20)
## [1] 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0
wine_df2$quality <- NULL
wine_df2$density <- NULL
summary(wine_df2)
## type fixed.acidity volatile.acidity citric.acid
## Min. :0.0000 Min. : 3.800 Min. :0.0800 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500
## Median :0.0000 Median : 7.000 Median :0.2900 Median :0.3100
## Mean :0.2461 Mean : 7.215 Mean :0.3397 Mean :0.3186
## 3rd Qu.:0.0000 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900
## Max. :1.0000 Max. :15.900 Max. :1.5800 Max. :1.6600
## residual.sugar chlorides free.sulfur.dioxide
## Min. : 0.600 Min. :0.00900 Min. : 1.00
## 1st Qu.: 1.800 1st Qu.:0.03800 1st Qu.: 17.00
## Median : 3.000 Median :0.04700 Median : 29.00
## Mean : 5.443 Mean :0.05603 Mean : 30.53
## 3rd Qu.: 8.100 3rd Qu.:0.06500 3rd Qu.: 41.00
## Max. :65.800 Max. :0.61100 Max. :289.00
## total.sulfur.dioxide pH sulphates alcohol
## Min. : 6.0 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.: 77.0 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50
## Median :118.0 Median :3.210 Median :0.5100 Median :10.30
## Mean :115.7 Mean :3.219 Mean :0.5313 Mean :10.49
## 3rd Qu.:156.0 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30
## Max. :440.0 Max. :4.010 Max. :2.0000 Max. :14.90
## quality.dum
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1966
## 3rd Qu.:0.0000
## Max. :1.0000
wine_df2_std <- data.frame(wine_df2)
for(col in names(wine_df2)){
wine_df2_std[,col] <- (wine_df2[,col]-min(wine_df2[,col]))/(max(wine_df2[,col])-min(wine_df2[,col]))
}
summary(wine_df2_std)
## type fixed.acidity volatile.acidity citric.acid
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.2149 1st Qu.:0.1000 1st Qu.:0.1506
## Median :0.0000 Median :0.2645 Median :0.1400 Median :0.1867
## Mean :0.2461 Mean :0.2823 Mean :0.1731 Mean :0.1919
## 3rd Qu.:0.0000 3rd Qu.:0.3223 3rd Qu.:0.2133 3rd Qu.:0.2349
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## residual.sugar chlorides free.sulfur.dioxide
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.01840 1st Qu.:0.04817 1st Qu.:0.05556
## Median :0.03681 Median :0.06312 Median :0.09722
## Mean :0.07428 Mean :0.07813 Mean :0.10252
## 3rd Qu.:0.11503 3rd Qu.:0.09302 3rd Qu.:0.13889
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## total.sulfur.dioxide pH sulphates alcohol
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1636 1st Qu.:0.3023 1st Qu.:0.1180 1st Qu.:0.2174
## Median :0.2581 Median :0.3798 Median :0.1629 Median :0.3333
## Mean :0.2529 Mean :0.3864 Mean :0.1749 Mean :0.3611
## 3rd Qu.:0.3456 3rd Qu.:0.4651 3rd Qu.:0.2135 3rd Qu.:0.4783
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## quality.dum
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1966
## 3rd Qu.:0.0000
## Max. :1.0000
model_std2 <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_df2_std)
summary(model_std2)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_df2_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5798 -0.6376 -0.3789 -0.1717 3.1758
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.21709 0.31334 -13.459 < 2e-16 ***
## type -0.09455 0.20284 -0.466 0.641114
## fixed.acidity 1.92010 0.50027 3.838 0.000124 ***
## volatile.acidity -5.95987 0.57818 -10.308 < 2e-16 ***
## citric.acid -0.68223 0.57079 -1.195 0.231996
## residual.sugar 4.04457 0.61095 6.620 3.59e-11 ***
## chlorides -6.89350 1.60676 -4.290 1.78e-05 ***
## free.sulfur.dioxide 3.83385 0.83585 4.587 4.50e-06 ***
## total.sulfur.dioxide -2.27284 0.56864 -3.997 6.42e-05 ***
## pH 1.37311 0.34673 3.960 7.49e-05 ***
## sulphates 3.31803 0.48450 6.848 7.47e-12 ***
## alcohol 6.25881 0.26396 23.711 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5122.6 on 6485 degrees of freedom
## AIC: 5146.6
##
## Number of Fisher Scoring iterations: 6
round(vif(model_std2),2)
## type fixed.acidity volatile.acidity
## 4.81 2.42 1.77
## citric.acid residual.sugar chlorides
## 1.52 1.47 2.26
## free.sulfur.dioxide total.sulfur.dioxide pH
## 2.10 3.56 1.53
## sulphates alcohol
## 1.43 1.58
### No Multicollinearity
First, we confirm our first hypothesis that alcohol is significant with an alpha 0.05 and positively associated with quality. Second, we confirm our second hypothesis that sugar is also significant and positively associated with quality. Note, residual sugar is significant for both red and white, and not significantly higher for white wine, like we originally thought.
#setwd("C:\\Users\\Siebelm\\Documents\\3 GWU\\1 DATS 6101\\Project\\Final")
wine_df <- read.csv("Combined_Wine_Data_NumType.csv", sep=",")
wine_df$quality.dum[wine_df$quality < 7] <- 0
wine_df$quality.dum[wine_df$quality >= 7] <- 1
wine_df$quality <- NULL
wine_df$density <- NULL
wine_df_std <- data.frame(wine_df)
for(col in names(wine_df)){
wine_df_std[,col] <- (wine_df[,col]-min(wine_df[,col]))/(max(wine_df[,col])-min(wine_df[,col]))
}
summary(wine_df_std)
## type fixed.acidity volatile.acidity citric.acid
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.2149 1st Qu.:0.1000 1st Qu.:0.1506
## Median :0.0000 Median :0.2645 Median :0.1400 Median :0.1867
## Mean :0.2461 Mean :0.2823 Mean :0.1731 Mean :0.1919
## 3rd Qu.:0.0000 3rd Qu.:0.3223 3rd Qu.:0.2133 3rd Qu.:0.2349
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## residual.sugar chlorides free.sulfur.dioxide
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.01840 1st Qu.:0.04817 1st Qu.:0.05556
## Median :0.03681 Median :0.06312 Median :0.09722
## Mean :0.07428 Mean :0.07813 Mean :0.10252
## 3rd Qu.:0.11503 3rd Qu.:0.09302 3rd Qu.:0.13889
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## total.sulfur.dioxide pH sulphates alcohol
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1636 1st Qu.:0.3023 1st Qu.:0.1180 1st Qu.:0.2174
## Median :0.2581 Median :0.3798 Median :0.1629 Median :0.3333
## Mean :0.2529 Mean :0.3864 Mean :0.1749 Mean :0.3611
## 3rd Qu.:0.3456 3rd Qu.:0.4651 3rd Qu.:0.2135 3rd Qu.:0.4783
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## quality.dum
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1966
## 3rd Qu.:0.0000
## Max. :1.0000
### Model without Interaction Terms
first_model_std <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_df_std)
summary(first_model_std)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_df_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5798 -0.6376 -0.3789 -0.1717 3.1758
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.21709 0.31334 -13.459 < 2e-16 ***
## type -0.09455 0.20284 -0.466 0.641114
## fixed.acidity 1.92010 0.50027 3.838 0.000124 ***
## volatile.acidity -5.95987 0.57818 -10.308 < 2e-16 ***
## citric.acid -0.68223 0.57079 -1.195 0.231996
## residual.sugar 4.04457 0.61095 6.620 3.59e-11 ***
## chlorides -6.89350 1.60676 -4.290 1.78e-05 ***
## free.sulfur.dioxide 3.83385 0.83585 4.587 4.50e-06 ***
## total.sulfur.dioxide -2.27284 0.56864 -3.997 6.42e-05 ***
## pH 1.37311 0.34673 3.960 7.49e-05 ***
## sulphates 3.31803 0.48450 6.848 7.47e-12 ***
## alcohol 6.25881 0.26396 23.711 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5122.6 on 6485 degrees of freedom
## AIC: 5146.6
##
## Number of Fisher Scoring iterations: 6
round(vif(first_model_std),2)
## type fixed.acidity volatile.acidity
## 4.81 2.42 1.77
## citric.acid residual.sugar chlorides
## 1.52 1.47 2.26
## free.sulfur.dioxide total.sulfur.dioxide pH
## 2.10 3.56 1.53
## sulphates alcohol
## 1.43 1.58
### Model with Interaction Terms
model <- glm(quality.dum~fixed.acidity + volatile.acidity + residual.sugar +
chlorides + pH + total.sulfur.dioxide + alcohol +
citric.acid*type + sulphates*type + total.sulfur.dioxide*type,
family = binomial(link = "logit"), data = wine_df)
summary(model)
##
## Call:
## glm(formula = quality.dum ~ fixed.acidity + volatile.acidity +
## residual.sugar + chlorides + pH + total.sulfur.dioxide +
## alcohol + citric.acid * type + sulphates * type + total.sulfur.dioxide *
## type, family = binomial(link = "logit"), data = wine_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3865 -0.6408 -0.3825 -0.1345 3.2357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.358e+01 1.164e+00 -11.669 < 2e-16 ***
## fixed.acidity 6.572e-02 4.385e-02 1.499 0.133899
## volatile.acidity -3.911e+00 3.931e-01 -9.949 < 2e-16 ***
## residual.sugar 5.862e-02 9.379e-03 6.250 4.10e-10 ***
## chlorides -1.349e+01 2.712e+00 -4.972 6.62e-07 ***
## pH 9.693e-01 2.710e-01 3.576 0.000349 ***
## total.sulfur.dioxide -1.993e-05 1.114e-03 -0.018 0.985728
## alcohol 8.958e-01 3.861e-02 23.199 < 2e-16 ***
## citric.acid -9.909e-01 3.891e-01 -2.547 0.010878 *
## type -1.210e+00 4.772e-01 -2.537 0.011193 *
## sulphates 1.313e+00 3.190e-01 4.116 3.86e-05 ***
## citric.acid:type 1.963e+00 6.413e-01 3.060 0.002210 **
## type:sulphates 2.151e+00 6.108e-01 3.521 0.000430 ***
## total.sulfur.dioxide:type -1.375e-02 3.714e-03 -3.702 0.000214 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5107.5 on 6483 degrees of freedom
## AIC: 5135.5
##
## Number of Fisher Scoring iterations: 6
round(summary(model)$coefficients,2)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.58 1.16 -11.67 0.00
## fixed.acidity 0.07 0.04 1.50 0.13
## volatile.acidity -3.91 0.39 -9.95 0.00
## residual.sugar 0.06 0.01 6.25 0.00
## chlorides -13.49 2.71 -4.97 0.00
## pH 0.97 0.27 3.58 0.00
## total.sulfur.dioxide 0.00 0.00 -0.02 0.99
## alcohol 0.90 0.04 23.20 0.00
## citric.acid -0.99 0.39 -2.55 0.01
## type -1.21 0.48 -2.54 0.01
## sulphates 1.31 0.32 4.12 0.00
## citric.acid:type 1.96 0.64 3.06 0.00
## type:sulphates 2.15 0.61 3.52 0.00
## total.sulfur.dioxide:type -0.01 0.00 -3.70 0.00
model_std <- glm(quality.dum~fixed.acidity + volatile.acidity + residual.sugar +
chlorides + pH + total.sulfur.dioxide + alcohol +
citric.acid*type + sulphates*type + total.sulfur.dioxide*type,
family = binomial(link = "logit"), data = wine_df_std)
summary(model_std)
##
## Call:
## glm(formula = quality.dum ~ fixed.acidity + volatile.acidity +
## residual.sugar + chlorides + pH + total.sulfur.dioxide +
## alcohol + citric.acid * type + sulphates * type + total.sulfur.dioxide *
## type, family = binomial(link = "logit"), data = wine_df_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3865 -0.6408 -0.3825 -0.1345 3.2357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.640740 0.328976 -11.067 < 2e-16 ***
## fixed.acidity 0.795258 0.530560 1.499 0.133899
## volatile.acidity -5.866126 0.589607 -9.949 < 2e-16 ***
## residual.sugar 3.822155 0.611527 6.250 4.10e-10 ***
## chlorides -8.118258 1.632773 -4.972 6.62e-07 ***
## pH 1.250444 0.349650 3.576 0.000349 ***
## total.sulfur.dioxide -0.008649 0.483502 -0.018 0.985728
## alcohol 6.181056 0.266439 23.199 < 2e-16 ***
## citric.acid -1.644932 0.645937 -2.547 0.010878 *
## type -0.819773 0.386240 -2.122 0.033801 *
## sulphates 2.337001 0.567834 4.116 3.86e-05 ***
## citric.acid:type 3.257980 1.064527 3.060 0.002210 **
## type:sulphates 3.828102 1.087221 3.521 0.000430 ***
## total.sulfur.dioxide:type -5.967505 1.611787 -3.702 0.000214 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5107.5 on 6483 degrees of freedom
## AIC: 5135.5
##
## Number of Fisher Scoring iterations: 6
round(summary(model_std)$coefficients,2)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.64 0.33 -11.07 0.00
## fixed.acidity 0.80 0.53 1.50 0.13
## volatile.acidity -5.87 0.59 -9.95 0.00
## residual.sugar 3.82 0.61 6.25 0.00
## chlorides -8.12 1.63 -4.97 0.00
## pH 1.25 0.35 3.58 0.00
## total.sulfur.dioxide -0.01 0.48 -0.02 0.99
## alcohol 6.18 0.27 23.20 0.00
## citric.acid -1.64 0.65 -2.55 0.01
## type -0.82 0.39 -2.12 0.03
## sulphates 2.34 0.57 4.12 0.00
## citric.acid:type 3.26 1.06 3.06 0.00
## type:sulphates 3.83 1.09 3.52 0.00
## total.sulfur.dioxide:type -5.97 1.61 -3.70 0.00
We will show that our residuals are not great, and our AIC is high. However, we have a high hit rate of 81%. After dividing the sample, we get high sensitivity when the true positive rate reaches 82%, which has a trade off with a false positive rate of just below 40%. This means we are roughly twice as likely to get a true postive verses a false positive. Overall, our model is not perfect but it is strong enough to make predictions. It is better with interaction terms compared to without in terms of deviance residual, AIC, and goodness-of-fit.
### Diagnostics ###
wine_df <- read.csv("Combined_Wine_Data_NumType.csv", sep=",")
wine_df$quality.dum[wine_df$quality < 7] <- 0
wine_df$quality.dum[wine_df$quality >= 7] <- 1
wine_df$quality <- NULL
wine_df$density <- NULL
wine_df_std <- data.frame(wine_df)
for(col in names(wine_df)){
wine_df_std[,col] <- (wine_df[,col]-min(wine_df[,col]))/(max(wine_df[,col])-min(wine_df[,col]))
}
colnames(wine_df_std)
## [1] "type" "fixed.acidity" "volatile.acidity"
## [4] "citric.acid" "residual.sugar" "chlorides"
## [7] "free.sulfur.dioxide" "total.sulfur.dioxide" "pH"
## [10] "sulphates" "alcohol" "quality.dum"
first_model_std <- glm(quality.dum~., family = binomial(link = "logit"), data = wine_df_std)
summary(first_model_std)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = wine_df_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5798 -0.6376 -0.3789 -0.1717 3.1758
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.21709 0.31334 -13.459 < 2e-16 ***
## type -0.09455 0.20284 -0.466 0.641114
## fixed.acidity 1.92010 0.50027 3.838 0.000124 ***
## volatile.acidity -5.95987 0.57818 -10.308 < 2e-16 ***
## citric.acid -0.68223 0.57079 -1.195 0.231996
## residual.sugar 4.04457 0.61095 6.620 3.59e-11 ***
## chlorides -6.89350 1.60676 -4.290 1.78e-05 ***
## free.sulfur.dioxide 3.83385 0.83585 4.587 4.50e-06 ***
## total.sulfur.dioxide -2.27284 0.56864 -3.997 6.42e-05 ***
## pH 1.37311 0.34673 3.960 7.49e-05 ***
## sulphates 3.31803 0.48450 6.848 7.47e-12 ***
## alcohol 6.25881 0.26396 23.711 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5122.6 on 6485 degrees of freedom
## AIC: 5146.6
##
## Number of Fisher Scoring iterations: 6
library(ResourceSelection)
## ResourceSelection 0.3-2 2017-02-28
hoslem.test(wine_df$quality.dum, fitted(first_model_std)) # Fail test
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: wine_df$quality.dum, fitted(first_model_std)
## X-squared = 34.747, df = 8, p-value = 2.973e-05
qqnorm(resid(first_model_std))
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
round(pR2(first_model_std),2) # McFadden Pseudo-R2 is above 0.2
## llh llhNull G2 McFadden r2ML r2CU
## -2561.31 -3219.82 1317.01 0.20 0.18 0.29
summary(model_std)
##
## Call:
## glm(formula = quality.dum ~ fixed.acidity + volatile.acidity +
## residual.sugar + chlorides + pH + total.sulfur.dioxide +
## alcohol + citric.acid * type + sulphates * type + total.sulfur.dioxide *
## type, family = binomial(link = "logit"), data = wine_df_std)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3865 -0.6408 -0.3825 -0.1345 3.2357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.640740 0.328976 -11.067 < 2e-16 ***
## fixed.acidity 0.795258 0.530560 1.499 0.133899
## volatile.acidity -5.866126 0.589607 -9.949 < 2e-16 ***
## residual.sugar 3.822155 0.611527 6.250 4.10e-10 ***
## chlorides -8.118258 1.632773 -4.972 6.62e-07 ***
## pH 1.250444 0.349650 3.576 0.000349 ***
## total.sulfur.dioxide -0.008649 0.483502 -0.018 0.985728
## alcohol 6.181056 0.266439 23.199 < 2e-16 ***
## citric.acid -1.644932 0.645937 -2.547 0.010878 *
## type -0.819773 0.386240 -2.122 0.033801 *
## sulphates 2.337001 0.567834 4.116 3.86e-05 ***
## citric.acid:type 3.257980 1.064527 3.060 0.002210 **
## type:sulphates 3.828102 1.087221 3.521 0.000430 ***
## total.sulfur.dioxide:type -5.967505 1.611787 -3.702 0.000214 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5107.5 on 6483 degrees of freedom
## AIC: 5135.5
##
## Number of Fisher Scoring iterations: 6
hoslem.test(wine_df$quality.dum, fitted(model_std)) # Fail test
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: wine_df$quality.dum, fitted(model_std)
## X-squared = 33.943, df = 8, p-value = 4.16e-05
qqnorm(resid(model_std))
round(pR2(model_std),2) # McFadden Pseudo-R2 is above 0.2
## llh llhNull G2 McFadden r2ML r2CU
## -2553.77 -3219.82 1332.10 0.21 0.19 0.29
prob <- predict(model, type = c("response"))
h <- roc(quality.dum~prob, data=wine_df)
h # Area under the curve: 0.8104
##
## Call:
## roc.formula(formula = quality.dum ~ prob, data = wine_df)
##
## Data: prob in 5220 controls (quality.dum 0) < 1277 cases (quality.dum 1).
## Area under the curve: 0.809
plot(h)
### Train v Test ###
View(wine_df)
wine_df_random <- wine_df[sample(1:nrow(wine_df)), ]
View(wine_df_random)
train <- wine_df_random[1:3248, ]
test <- wine_df_random[3249:6497, ]
train.model <- glm(quality.dum~fixed.acidity + volatile.acidity + residual.sugar +
chlorides + pH + total.sulfur.dioxide + alcohol +
citric.acid*type + sulphates*type + total.sulfur.dioxide*type,
family = binomial(link = "logit"), data = train)
summary(train.model)
##
## Call:
## glm(formula = quality.dum ~ fixed.acidity + volatile.acidity +
## residual.sugar + chlorides + pH + total.sulfur.dioxide +
## alcohol + citric.acid * type + sulphates * type + total.sulfur.dioxide *
## type, family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0908 -0.6355 -0.3784 -0.1274 3.2449
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.459e+01 1.656e+00 -8.815 < 2e-16 ***
## fixed.acidity 6.228e-02 6.181e-02 1.008 0.313667
## volatile.acidity -3.271e+00 5.479e-01 -5.969 2.38e-09 ***
## residual.sugar 6.479e-02 1.399e-02 4.631 3.65e-06 ***
## chlorides -1.575e+01 3.962e+00 -3.977 6.99e-05 ***
## pH 9.783e-01 3.808e-01 2.569 0.010190 *
## total.sulfur.dioxide 4.784e-04 1.594e-03 0.300 0.764100
## alcohol 9.662e-01 5.632e-02 17.156 < 2e-16 ***
## citric.acid -1.607e-01 5.796e-01 -0.277 0.781625
## type -1.992e+00 7.399e-01 -2.693 0.007084 **
## sulphates 1.003e+00 4.457e-01 2.251 0.024389 *
## citric.acid:type 1.227e+00 8.894e-01 1.379 0.167744
## type:sulphates 3.423e+00 9.742e-01 3.514 0.000441 ***
## total.sulfur.dioxide:type -8.582e-03 4.391e-03 -1.954 0.050664 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3301.0 on 3247 degrees of freedom
## Residual deviance: 2573.1 on 3234 degrees of freedom
## AIC: 2601.1
##
## Number of Fisher Scoring iterations: 6
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
pred.model <- predict.glm(train.model, test, type='response')
newpred <- prediction(pred.model, test$quality.dum)
newpred.performance <- performance(newpred, measure = "tpr",x.measure = "fpr")
plot(newpred.performance)
abline(a=0, b= 1)
AUC <- performance(newpred, measure = "auc")
AUC # Correctly predicted 82%
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.7975164
##
##
## Slot "alpha.values":
## list()
Need to manually create odds ratio for interaction terms. The log odds are not very useful as the scales are not inuitive (unless perhaps you are a chemist). Instead, we will illustrate change using predicted probabilities.
### Create Log odds ###
alcohol <- summary(model)$coefficients[8]
residual.sugar <- summary(model)$coefficients[4]
white.citric.acid.coef <- summary(model)$coefficients[9]
red.citric.acid.coef <- summary(model)$coefficients[9]+summary(model)$coefficients[12]
white.sulphates.coef <- summary(model)$coefficients[11]
red.sulphates.coef <- summary(model)$coefficients[11]+summary(model)$coefficients[13]
white.total.sulfur.dioxide.coef <- summary(model)$coefficients[7]
red.total.sulfur.dioxide.coef <- summary(model)$coefficients[7]+summary(model)$coefficients[14]
### Log odds
paste("Alcohol", round(exp(alcohol),2))
## [1] "Alcohol 2.45"
paste("Residual Sugar", round(residual.sugar,2))
## [1] "Residual Sugar 0.06"
paste("White wine: Citric Acid", round(exp(white.citric.acid.coef),2))
## [1] "White wine: Citric Acid 0.37"
paste("Red wine: Citric Acid", round(exp(red.citric.acid.coef),2))
## [1] "Red wine: Citric Acid 2.64"
paste("White wine: Sulphates", round(exp(white.sulphates.coef),2))
## [1] "White wine: Sulphates 3.72"
paste("Red wine: Sulphates", round(exp(red.sulphates.coef),2))
## [1] "Red wine: Sulphates 31.93"
paste("White wine: SO2", round(exp(white.total.sulfur.dioxide.coef),2))
## [1] "White wine: SO2 1"
paste("Red wine: SO2", round(exp(red.total.sulfur.dioxide.coef),2))
## [1] "Red wine: SO2 0.99"
Generating predictions as variables move from their 20th percentile to their 80th percentile. While such a change in variable is uncommon, it is much more likely than a variable mvoing from their minimum to their maximum.
Predictions 1) Alcohol: predict quality of wine as alcohol moves from 9.4% to 11.5% alcohol content and everything else, including type of wine, rest at their mean. 2) Residual Sugar: predict quality of wine as sugar moves from 1.7 g/dm3 to 9.6 g/dm3 and everything else, including type of wine, rest at their mean. 3) Red Wine Interactions: predict quality of red wine as SO2 moves from 165.0 to 62.2 mg/dm3 and Sulphates move from 0.41 g(potassium sulphate)/dm3 to 0.63 g(potassium sulphate)/dm3 and everything else, except type of wine which is red, rest at their mean. We also predict the same relationship with white wine to show that it has a smaller impact. 4) White Wine Interactions: predict quality of white wine as citric acid moves from 0.23 to 0.42 mg/dm3 and everything else, except type of wine which is white, rest at their mean. We also predict the same relationship with red wine to show that it has a smaller impact. 5) Top Six for red wine: predict quality of red wine based on high alcohol, sugar, and sulphates, and low total SO2, Chlorides, and Volatile Acidity, with the rest at its mean.
We created predicted probabilities that show that as alcohol content moves from its 20th percentile of 9.4% to its 80th percentile of 11.5%, the likelihood of that wine resulting in high quality wine moves from 0.04 to 0.21. For residual sugar, a similar change increases the likelihood from 0.08 to 0.12. The red wine interactions (sulphates and total SO2) also make large increases in probability, but the white wine interactions (citric acid) do not make as much change. Using the top six variables at their 80th percentile (or 20th percentile if they are negative), we can reach a predicted probability as high as 0.5. In other words, it’s extremely difficult to make high quality wine more likely than low quality wine, based on how we defined high quality wine.
### Predicted Probabilities ###
model <- glm(quality.dum~fixed.acidity + volatile.acidity + residual.sugar +
chlorides + pH + total.sulfur.dioxide + alcohol +
citric.acid*type + sulphates*type + total.sulfur.dioxide*type,
family = binomial(link = "logit"), data = wine_df)
summary(model)
##
## Call:
## glm(formula = quality.dum ~ fixed.acidity + volatile.acidity +
## residual.sugar + chlorides + pH + total.sulfur.dioxide +
## alcohol + citric.acid * type + sulphates * type + total.sulfur.dioxide *
## type, family = binomial(link = "logit"), data = wine_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3865 -0.6408 -0.3825 -0.1345 3.2357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.358e+01 1.164e+00 -11.669 < 2e-16 ***
## fixed.acidity 6.572e-02 4.385e-02 1.499 0.133899
## volatile.acidity -3.911e+00 3.931e-01 -9.949 < 2e-16 ***
## residual.sugar 5.862e-02 9.379e-03 6.250 4.10e-10 ***
## chlorides -1.349e+01 2.712e+00 -4.972 6.62e-07 ***
## pH 9.693e-01 2.710e-01 3.576 0.000349 ***
## total.sulfur.dioxide -1.993e-05 1.114e-03 -0.018 0.985728
## alcohol 8.958e-01 3.861e-02 23.199 < 2e-16 ***
## citric.acid -9.909e-01 3.891e-01 -2.547 0.010878 *
## type -1.210e+00 4.772e-01 -2.537 0.011193 *
## sulphates 1.313e+00 3.190e-01 4.116 3.86e-05 ***
## citric.acid:type 1.963e+00 6.413e-01 3.060 0.002210 **
## type:sulphates 2.151e+00 6.108e-01 3.521 0.000430 ***
## total.sulfur.dioxide:type -1.375e-02 3.714e-03 -3.702 0.000214 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6439.6 on 6496 degrees of freedom
## Residual deviance: 5107.5 on 6483 degrees of freedom
## AIC: 5135.5
##
## Number of Fisher Scoring iterations: 6
### Alcohol ###
quantile(wine_df$alcohol, c(.2, .8))
## 20% 80%
## 9.4 11.5
# High Alcohol
highalcohol.pp <- data.frame(type=mean(wine_df$type), alcohol=quantile(wine_df$alcohol, c(.8)),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), pH=mean(wine_df$pH),
sulphates=mean(wine_df$sulphates), citric.acid=mean(wine_df$citric.acid))
highalcohol.pp.pred <- predict(model, highalcohol.pp, type='response')
# Low Alcohol
lowalcohol.pp <- data.frame(type=mean(wine_df$type), alcohol=quantile(wine_df$alcohol, c(.2)),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), pH=mean(wine_df$pH),
sulphates=mean(wine_df$sulphates), citric.acid=mean(wine_df$citric.acid))
lowalcohol.pp.pred <- predict(model, lowalcohol.pp, type='response')
paste("Wine with the max alcohol: ", round(highalcohol.pp.pred*100,0))
## [1] "Wine with the max alcohol: 21"
paste("Wine with the min alcohol: ", round(lowalcohol.pp.pred*100,0))
## [1] "Wine with the min alcohol: 4"
### Residual Sugar ###
quantile(wine_df$residual.sugar, c(.2, .8))
## 20% 80%
## 1.7 9.6
# High sugar
highsugar.pp <- data.frame(type=mean(wine_df$type), residual.sugar=quantile(wine_df$residual.sugar, c(.8)),
alcohol=mean(wine_df$alcohol), total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), pH=mean(wine_df$pH),
sulphates=mean(wine_df$sulphates), citric.acid=mean(wine_df$citric.acid))
highsugar.pp.pred <- predict(model, highsugar.pp, type='response')
# Low sugar
lowsugar.pp <- data.frame(type=mean(wine_df$type), residual.sugar=quantile(wine_df$residual.sugar, c(.2)),
alcohol=mean(wine_df$alcohol), total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), pH=mean(wine_df$pH),
sulphates=mean(wine_df$sulphates), citric.acid=mean(wine_df$citric.acid))
lowsugar.pp.pred <- predict(model, lowsugar.pp, type='response')
paste("Wine with the max alcohol: ", round(highsugar.pp.pred*100,0))
## [1] "Wine with the max alcohol: 12"
paste("Wine with the min alcohol: ", round(lowsugar.pp.pred*100,0))
## [1] "Wine with the min alcohol: 8"
### Interactions ###
### Red Interactions ###
quantile(wine_df$total.sulfur.dioxide, c(.2, .8))
## 20% 80%
## 62.2 165.0
quantile(wine_df$sulphates, c(.2, .8))
## 20% 80%
## 0.41 0.63
# Low SO2 and High Sulphates
red.bestred.pp <- data.frame(type=1, total.sulfur.dioxide=quantile(wine_df$total.sulfur.dioxide, c(.2)),
sulphates=quantile(wine_df$sulphates, c(.8)),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol), citric.acid=mean(wine_df$citric.acid))
red.bestred.pp.pred <- predict(model, red.bestred.pp, type='response')
# High SO2 and Low Sulphates
red.worstred.pp <- data.frame(type=1, total.sulfur.dioxide=quantile(wine_df$total.sulfur.dioxide, c(.8)),
sulphates=quantile(wine_df$sulphates, c(.2)),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol), citric.acid=mean(wine_df$citric.acid))
red.worstred.pp.pred <- predict(model, red.worstred.pp, type='response')
paste("Red Wine with the max sulphates and min SO2: ", round(red.bestred.pp.pred*100,0))
## [1] "Red Wine with the max sulphates and min SO2: 13"
paste("Red Wine with the min sulphates and max SO2: ", round(red.worstred.pp.pred*100,0))
## [1] "Red Wine with the min sulphates and max SO2: 2"
# Compare results to the white wine equivalent
# Low SO2 and High Sulphates
white.bestwhite.pp <- data.frame(type=0, total.sulfur.dioxide=quantile(wine_df$total.sulfur.dioxide, c(.2)),
sulphates=quantile(wine_df$sulphates, c(.8)),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol), citric.acid=mean(wine_df$citric.acid))
white.bestwhite.pp.pred <- predict(model, white.bestwhite.pp, type='response')
# High SO2 and Low Sulphates
white.worstwhite.pp <- data.frame(type=0, total.sulfur.dioxide=quantile(wine_df$total.sulfur.dioxide, c(.8)),
sulphates=quantile(wine_df$sulphates, c(.2)),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol),
citric.acid=mean(wine_df$citric.acid))
white.worstwhite.pp.pred <- predict(model, white.worstwhite.pp, type='response')
paste("White Wine with the max sulphates and min SO2: ", round(white.bestwhite.pp.pred*100,0))
## [1] "White Wine with the max sulphates and min SO2: 14"
paste("White Wine with the min sulphates and max SO2: ", round(white.worstwhite.pp.pred*100,0))
## [1] "White Wine with the min sulphates and max SO2: 11"
### White Interactions ###
quantile(wine_df$citric.acid, c(.2, .8))
## 20% 80%
## 0.23 0.42
# Low Citric Acid
white.bestwhite.pp <- data.frame(type=0, citric.acid=quantile(wine_df$citric.acid, c(.2)),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide),
sulphates=mean(wine_df$sulphates),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol))
white.bestwhite.pp.pred <- predict(model, white.bestwhite.pp, type='response')
# High Citric Acid
white.worstwhite.pp <- data.frame(type=0, citric.acid=quantile(wine_df$citric.acid, c(.8)),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide),
sulphates=mean(wine_df$sulphates),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol))
white.worstwhite.pp.pred <- predict(model, white.worstwhite.pp, type='response')
paste("White Wine with the min Citric Acid: ", round(white.bestwhite.pp.pred*100,0))
## [1] "White Wine with the min Citric Acid: 13"
paste("White Wine with the max Citric Acid: ", round(white.worstwhite.pp.pred*100,0))
## [1] "White Wine with the max Citric Acid: 11"
# Red Wine equivalent
# Low Citric Acid
red.bestwhite.pp <- data.frame(type=1, citric.acid=quantile(wine_df$citric.acid, c(.2)),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), sulphates=mean(wine_df$sulphates),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol))
red.bestwhite.pp.pred <- predict(model, red.bestwhite.pp, type='response')
# High Citric Acid
red.worstwhite.pp <- data.frame(type=1, citric.acid=quantile(wine_df$citric.acid, c(.8)),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), sulphates=mean(wine_df$sulphates),
chlorides=mean(wine_df$chlorides), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=mean(wine_df$volatile.acidity),
residual.sugar=mean(wine_df$residual.sugar),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
pH=mean(wine_df$pH), alcohol=mean(wine_df$alcohol))
red.worstwhite.pp.pred <- predict(model, red.worstwhite.pp, type='response')
paste("White Wine with the min Citric Acid: ", round(red.bestwhite.pp.pred*100,0))
## [1] "White Wine with the min Citric Acid: 4"
paste("White Wine with the max Citric Acid: ", round(red.worstwhite.pp.pred*100,0))
## [1] "White Wine with the max Citric Acid: 5"
### Best wine ###
# High alcohol, sugar, and sulphates, and low total SO2, Chlorides, and Volatile Acidity, and red wine
bestwine.pp <- data.frame(type=1, alcohol=quantile(wine_df$alcohol, c(.8)),
total.sulfur.dioxide=quantile(wine_df$total.sulfur.dioxide, c(.2)),
chlorides=quantile(wine_df$chlorides, c(.2)), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=quantile(wine_df$volatile.acidity, c(.2)),
residual.sugar=quantile(wine_df$residual.sugar, c(.8)),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), pH=mean(wine_df$pH),
sulphates=quantile(wine_df$sulphates, c(.8)), citric.acid=mean(wine_df$citric.acid))
bestwine.pp.pred <- predict(model, bestwine.pp, type='response')
# Low alcohol, sugar, and sulphates, and high total SO2, Chlorides, and Volatile Acidity, and red wine
worsewine.pp <- data.frame(type=1, alcohol=quantile(wine_df$alcohol, c(.2)),
total.sulfur.dioxide=quantile(wine_df$total.sulfur.dioxide, c(.8)),
chlorides=quantile(wine_df$chlorides, c(.8)), fixed.acidity=mean(wine_df$fixed.acidity),
volatile.acidity=quantile(wine_df$volatile.acidity, c(.8)),
residual.sugar=quantile(wine_df$residual.sugar, c(.2)),
free.sulfur.dioxide=mean(wine_df$free.sulfur.dioxide),
total.sulfur.dioxide=mean(wine_df$total.sulfur.dioxide), pH=mean(wine_df$pH),
sulphates=quantile(wine_df$sulphates, c(.2)), citric.acid=mean(wine_df$citric.acid))
worsewine.pp.pred <- predict(model, worsewine.pp, type='response')
paste("Red wine with the top six best traits: ", round(bestwine.pp.pred*100,0))
## [1] "Red wine with the top six best traits: 50"
paste("Red wine with the top six worst traits: ", round(worsewine.pp.pred*100,2))
## [1] "Red wine with the top six worst traits: 0.26"