# Load all of the packages that you end up using
# in your analysis in this code chunk.
# Notice that the parameter "echo" was set to FALSE for this code chunk.
# This prevents the code from displaying in the knitted HTML output.
# You should set echo=FALSE for all code chunks in your file.
#install.packages("ggplot2", dependencies = T,repos = 'http://cran.us.r-project.org')
#install.packages("knitr", dependencies = T,repos = 'http://cran.us.r-project.org')#
#install.packages("dplyr", dependencies = T,repos = 'http://cran.us.r-project.org')#
#install.packages('Rcpp', dependencies = TRUE)#
library(ggplot2)
library(knitr)
library(GGally)
# Load the Data
# Read the red wine data set; the first CSV row supplies the column names.
redWine <- read.csv(file = "wineQualityReds.csv", header = TRUE)
# Rows x columns of the loaded data frame.
dim(redWine)
## [1] 1599 13
# Column names of the data frame.
names(redWine)
## [1] "X" "fixed.acidity" "volatile.acidity"
## [4] "citric.acid" "residual.sugar" "chlorides"
## [7] "free.sulfur.dioxide" "total.sulfur.dioxide" "density"
## [10] "pH" "sulphates" "alcohol"
## [13] "quality"
# Per-column summary statistics (min, quartiles, mean, max) for every column of redWine.
# NOTE(review): X appears to be a row index (1..1599), so its statistics are not meaningful.
summary(redWine)
## X fixed.acidity volatile.acidity citric.acid
## Min. : 1.0 Min. : 4.60 Min. :0.1200 Min. :0.000
## 1st Qu.: 400.5 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
## Median : 800.0 Median : 7.90 Median :0.5200 Median :0.260
## Mean : 800.0 Mean : 8.32 Mean :0.5278 Mean :0.271
## 3rd Qu.:1199.5 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
## Max. :1599.0 Max. :15.90 Max. :1.5800 Max. :1.000
## residual.sugar chlorides free.sulfur.dioxide
## Min. : 0.900 Min. :0.01200 Min. : 1.00
## 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00
## Median : 2.200 Median :0.07900 Median :14.00
## Mean : 2.539 Mean :0.08747 Mean :15.87
## 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00
## Max. :15.500 Max. :0.61100 Max. :72.00
## total.sulfur.dioxide density pH sulphates
## Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300
## 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500
## Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200
## Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581
## 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300
## Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000
## alcohol quality
## Min. : 8.40 Min. :3.000
## 1st Qu.: 9.50 1st Qu.:5.000
## Median :10.20 Median :6.000
## Mean :10.42 Mean :5.636
## 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :14.90 Max. :8.000
# Compact structure overview: the type and first few values of each column.
str(redWine)
## 'data.frame': 1599 obs. of 13 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Show the first six rows of the data set.
head(redWine, n = 6L)
## X fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 1 7.4 0.70 0.00 1.9 0.076
## 2 2 7.8 0.88 0.00 2.6 0.098
## 3 3 7.8 0.76 0.04 2.3 0.092
## 4 4 11.2 0.28 0.56 1.9 0.075
## 5 5 7.4 0.70 0.00 1.9 0.076
## 6 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
# Five-number summary plus mean of the quality ratings.
summary(redWine[["quality"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.636 6.000 8.000
# Histogram of alcohol content with the default 30 bins.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = alcohol)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Alcohol percentage falls mostly between 9 and 10 percent. The median red wine quality rating is 6, with a maximum of 8 and a minimum of 3. Ratings, in theory, could have been anywhere from 0 to 10. It is interesting to note that no wines received a very high or very low rating.
# Histogram of alcohol content with the default 30 bins.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = alcohol)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Most wines have an alcohol percentage between 9 and 11 percent, with a large number of wines between 9 and 10 percent.
# Alcohol histograms, one panel per quality rating (3 panels per row).
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = alcohol)) +
  geom_histogram() +
  facet_wrap(~quality, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Separating alcohol contents of wines by quality rating, it appears that the lower rated wines of 3 and 4 have lower alcohol content while the highest rated wine of 8 tends towards more alcohol. Wines with a rating of 5 have a large amount with alcohol percentages between 9-10. I wonder why wines with a rating of 5 have such a similar amount of alcohol content. Perhaps 9 is a good percentage for an entry-level, accessible, average-tasting wine?
# Volatile acidity histograms, one panel per quality rating (3 panels per row).
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = volatile.acidity)) +
  geom_histogram() +
  facet_wrap(~quality, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Volatile acidity, when largely present in wine, creates a bad taste. No surprise that the highest level acidity occurs in the lowest rated wine, 3. And as expected most wines have a low amount of volatile acidity. There appears to be an inverse relationship between volatile acidity and quality.
# Citric acid histograms, one panel per quality rating (3 panels per row).
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = citric.acid)) +
  geom_histogram() +
  facet_wrap(~quality, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Citric acid adds “freshness” to wines. It does not seem to have much effect on taste. Low quality wines and mid-level wines have a similar amount.
# Histogram of residual sugar with the default 30 bins.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = residual.sugar)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Residual sugar lies mostly between 2 and 3.
# Residual sugar histogram restricted to 0-2; rows outside the limits are
# dropped by the scale, which is what triggers the "Removed ... rows" warning.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = residual.sugar)) +
  geom_histogram() +
  xlim(0, 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 979 rows containing non-finite values (stat_bin).
Per the data notes, it is very rare to find a wine with less than one gram residual sugar, which is apparent in the data set. Wines with over 40 grams are considered sweet but no wine in the data set is that high.
# Count of wines per quality rating.
# quality is an integer score, so one bar per rating is clearer than a
# 30-bin histogram (which leaves most bins empty and emits a binwidth message).
ggplot(redWine, aes(x = factor(quality))) +
  geom_bar() +
  xlab("quality")
# Keep only the top-rated wines (quality 8) and print them.
# subset() evaluates the condition inside the data frame, so the column
# can be referenced by its bare name.
goodWines <- subset(redWine, quality == 8)
goodWines
## X fixed.acidity volatile.acidity citric.acid residual.sugar
## 268 268 7.9 0.35 0.46 3.6
## 279 279 10.3 0.32 0.45 6.4
## 391 391 5.6 0.85 0.05 1.4
## 441 441 12.6 0.31 0.72 2.2
## 456 456 11.3 0.62 0.67 5.2
## 482 482 9.4 0.30 0.56 2.8
## 496 496 10.7 0.35 0.53 2.6
## 499 499 10.7 0.35 0.53 2.6
## 589 589 5.0 0.42 0.24 2.0
## 829 829 7.8 0.57 0.09 2.3
## 1062 1062 9.1 0.40 0.50 1.8
## 1091 1091 10.0 0.26 0.54 1.9
## 1121 1121 7.9 0.54 0.34 2.5
## 1203 1203 8.6 0.42 0.39 1.8
## 1270 1270 5.5 0.49 0.03 1.8
## 1404 1404 7.2 0.33 0.33 1.7
## 1450 1450 7.2 0.38 0.31 2.0
## 1550 1550 7.4 0.36 0.30 1.8
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 268 0.078 15 37 0.99730 3.35
## 279 0.073 5 13 0.99760 3.23
## 391 0.045 12 88 0.99240 3.56
## 441 0.072 6 29 0.99870 2.88
## 456 0.086 6 19 0.99880 3.22
## 482 0.080 6 17 0.99640 3.15
## 496 0.070 5 16 0.99720 3.15
## 499 0.070 5 16 0.99720 3.15
## 589 0.060 19 50 0.99170 3.72
## 829 0.065 34 45 0.99417 3.46
## 1062 0.071 7 16 0.99462 3.21
## 1091 0.083 42 74 0.99451 2.98
## 1121 0.076 8 17 0.99235 3.20
## 1203 0.068 6 12 0.99516 3.35
## 1270 0.044 28 87 0.99080 3.50
## 1404 0.061 3 13 0.99600 3.23
## 1450 0.056 15 29 0.99472 3.23
## 1550 0.074 17 24 0.99419 3.24
## sulphates alcohol quality
## 268 0.86 12.8 8
## 279 0.82 12.6 8
## 391 0.82 12.9 8
## 441 0.82 9.8 8
## 456 0.69 13.4 8
## 482 0.92 11.7 8
## 496 0.65 11.0 8
## 499 0.65 11.0 8
## 589 0.74 14.0 8
## 829 0.74 12.7 8
## 1062 0.69 12.5 8
## 1091 0.63 11.8 8
## 1121 0.72 13.1 8
## 1203 0.69 11.7 8
## 1270 0.82 14.0 8
## 1404 1.10 10.0 8
## 1450 0.76 11.3 8
## 1550 0.70 11.4 8
Most wines are rated 5 or 6 out of 10. The highest rating is 8 and the lowest is 3. Curiously, no wine sampled received a really bad (0, 1, 2) or really good (9, 10) rating.
# Total sulfur dioxide histogram limited to 0-180 to cut off the extreme values.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = total.sulfur.dioxide)) +
  geom_histogram() +
  xlim(0, 180)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Total sulfur dioxide is mostly under 50. Over 50 would have some effect on taste according to the notes.
# Total sulfur dioxide histograms (0-180), one panel per quality rating.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = total.sulfur.dioxide)) +
  geom_histogram() +
  xlim(0, 180) +
  facet_wrap(~quality, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
The lowest rated wine in the data set is mostly under 50. The rest of the wines however are more disparate. It appears that most wines, regardless of rating, tend to be under 50. Total sulfur dioxide does not likely have much effect on rating.
# Chlorides histogram with 0.01-wide bins, limited to 0-0.2.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = chlorides)) +
  geom_histogram(binwidth = 0.010) +
  xlim(0, 0.2)
## Warning: Removed 41 rows containing non-finite values (stat_bin).
Most chloride (amount of salt) falls between 0.05 and 0.10.
# Chlorides histograms (0.01-wide bins, 0-0.2), one panel per quality rating.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = chlorides)) +
  geom_histogram(binwidth = 0.010) +
  xlim(0, 0.2) +
  facet_wrap(~quality, ncol = 3)
## Warning: Removed 41 rows containing non-finite values (stat_bin).
Chlorides appear to be similar across rating levels. It likely has little effect on taste.
# Histogram of pH with the default 30 bins.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = pH)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Most wines have a pH between 3 and 4.
# pH histograms, one panel per quality rating (3 panels per row).
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = pH)) +
  geom_histogram() +
  facet_wrap(~quality, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Again, looking across quality ratings, there appears to be no difference that can be attributed to pH.
# Histogram of sulphates with the default 30 bins.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = sulphates)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Most sulfate content is between 0.5 and 1.
# Sulphates histograms, one panel per quality rating (3 panels per row).
# Uses ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 >= 3.4.0.
ggplot(redWine, aes(x = sulphates)) +
  geom_histogram() +
  facet_wrap(~quality, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
When looking at sulphate content by rating there appears to be little effect. The lowest and highest rated wines tend to have less variability.
There are 1599 red wines with 13 variables: an index column (X), fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, and quality. The one factor in this data set is quality, which can range from 0 to 10 (0 the lowest, 10 the best). Interestingly, no wine scores below 3 or above 8. The median wine rating is 6. Alcohol percentage, which is the variable likely most familiar to readers, ranges from a minimum of 8.4 to a maximum of 14.9 percent. The median is 10.20 percent and the mean is 10.42 percent.
The main feature in the data set is quality. The goal of this study is to understand what are the relevant factors in determining the quality, as judged by taste, in red wine.
I suspect alcohol percentage will play a big role in quality of the wine. Also, volatile acidity (too much gives a vinegar like taste) and chlorides (saltiness) will also be two important factors in understanding wine quality.
No, at the moment I do not see a need to create a new variable. At this point it is too uncertain to tell which variables will be important in the analysis. As the collectors of the data set note, no one knows prior to analysis which factors will be important.
Citric acid has a huge bin near 0. I changed the xlim to run from 0 to 0.1 (originally a range of 0 to 1) and created extremely small bins of 0.025. It appears most wines have an almost non-existent amount of citric acid.
# Pairwise correlation matrix of every column in redWine.
# NOTE(review): this includes X, which appears to be a row index (1..1599);
# its correlations are not meaningful features - consider excluding it.
cor(redWine)
## X fixed.acidity volatile.acidity
## X 1.000000000 -0.26848392 -0.008815099
## fixed.acidity -0.268483920 1.00000000 -0.256130895
## volatile.acidity -0.008815099 -0.25613089 1.000000000
## citric.acid -0.153551355 0.67170343 -0.552495685
## residual.sugar -0.031260835 0.11477672 0.001917882
## chlorides -0.119868519 0.09370519 0.061297772
## free.sulfur.dioxide 0.090479643 -0.15379419 -0.010503827
## total.sulfur.dioxide -0.117849669 -0.11318144 0.076470005
## density -0.368372087 0.66804729 0.022026232
## pH 0.136005328 -0.68297819 0.234937294
## sulphates -0.125306999 0.18300566 -0.260986685
## alcohol 0.245122841 -0.06166827 -0.202288027
## quality 0.066452608 0.12405165 -0.390557780
## citric.acid residual.sugar chlorides
## X -0.15355136 -0.031260835 -0.119868519
## fixed.acidity 0.67170343 0.114776724 0.093705186
## volatile.acidity -0.55249568 0.001917882 0.061297772
## citric.acid 1.00000000 0.143577162 0.203822914
## residual.sugar 0.14357716 1.000000000 0.055609535
## chlorides 0.20382291 0.055609535 1.000000000
## free.sulfur.dioxide -0.06097813 0.187048995 0.005562147
## total.sulfur.dioxide 0.03553302 0.203027882 0.047400468
## density 0.36494718 0.355283371 0.200632327
## pH -0.54190414 -0.085652422 -0.265026131
## sulphates 0.31277004 0.005527121 0.371260481
## alcohol 0.10990325 0.042075437 -0.221140545
## quality 0.22637251 0.013731637 -0.128906560
## free.sulfur.dioxide total.sulfur.dioxide density
## X 0.090479643 -0.11784967 -0.36837209
## fixed.acidity -0.153794193 -0.11318144 0.66804729
## volatile.acidity -0.010503827 0.07647000 0.02202623
## citric.acid -0.060978129 0.03553302 0.36494718
## residual.sugar 0.187048995 0.20302788 0.35528337
## chlorides 0.005562147 0.04740047 0.20063233
## free.sulfur.dioxide 1.000000000 0.66766645 -0.02194583
## total.sulfur.dioxide 0.667666450 1.00000000 0.07126948
## density -0.021945831 0.07126948 1.00000000
## pH 0.070377499 -0.06649456 -0.34169933
## sulphates 0.051657572 0.04294684 0.14850641
## alcohol -0.069408354 -0.20565394 -0.49617977
## quality -0.050656057 -0.18510029 -0.17491923
## pH sulphates alcohol quality
## X 0.13600533 -0.125306999 0.24512284 0.06645261
## fixed.acidity -0.68297819 0.183005664 -0.06166827 0.12405165
## volatile.acidity 0.23493729 -0.260986685 -0.20228803 -0.39055778
## citric.acid -0.54190414 0.312770044 0.10990325 0.22637251
## residual.sugar -0.08565242 0.005527121 0.04207544 0.01373164
## chlorides -0.26502613 0.371260481 -0.22114054 -0.12890656
## free.sulfur.dioxide 0.07037750 0.051657572 -0.06940835 -0.05065606
## total.sulfur.dioxide -0.06649456 0.042946836 -0.20565394 -0.18510029
## density -0.34169933 0.148506412 -0.49617977 -0.17491923
## pH 1.00000000 -0.196647602 0.20563251 -0.05773139
## sulphates -0.19664760 1.000000000 0.09359475 0.25139708
## alcohol 0.20563251 0.093594750 1.00000000 0.47616632
## quality -0.05773139 0.251397079 0.47616632 1.00000000
Volatile acidity has a somewhat strong negative relationship with quality rating. Alcohol content has a somewhat strong positive relationship with quality rating. The other variables are not as strong.
# Scatterplot matrix of all wine measurements.
# X is the row index (1..1599), not a wine property, so it is dropped to keep
# the matrix smaller and free of meaningless panels.
ggpairs(redWine[, setdiff(names(redWine), "X")])
# Horizontal boxplots of alcohol content, one box per quality rating.
ggplot(
  data = redWine,
  mapping = aes(x = factor(quality), y = alcohol, fill = factor(quality))
) +
  geom_boxplot() +
  coord_flip()
High quality wines tend to have a higher alcohol percentage, average wines an average amount of alcohol, and the worst wines have lower alcohol. The variability on a wine rating of 6 however is surprising. A wine rated 6 goes as low as alcohol levels can go in the data. Also, wines rated 5 or 6 have many outliers. Wines rated 3 have the least amount of variability with the smallest IQR.
# Horizontal boxplots of volatile acidity, one box per quality rating.
ggplot(
  data = redWine,
  mapping = aes(x = factor(quality), y = volatile.acidity, fill = factor(quality))
) +
  geom_boxplot() +
  coord_flip()
As was seen in the correlation matrix above, higher volatile acidity is negatively related with quality. The lowest rated wines have much more than the higher rated wines. Among wines rated 3, the lowest rating, the biggest outlier for volatile acidity occurs.
# Simple linear regression of quality on volatile acidity.
# Bare column names plus `data =` (instead of redWine$... inside the formula)
# give clean coefficient names and let predict() work with new data.
lmQualVo <- lm(quality ~ volatile.acidity, data = redWine)
summary(lmQualVo)
##
## Call:
## lm(formula = quality ~ volatile.acidity, data = redWine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.79071 -0.54411 -0.00687 0.47350 2.93148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56575 0.05791 113.39 <2e-16 ***
## volatile.acidity -1.76144 0.10389 -16.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7437 on 1597 degrees of freedom
## Multiple R-squared: 0.1525, Adjusted R-squared: 0.152
## F-statistic: 287.4 on 1 and 1597 DF, p-value: < 2.2e-16
Running a linear regression with volatile acidity as the independent variable and quality as the dependent variable shows an R^2 of only 0.15. A linear model is unlikely to be a great model for this relationship.
# Quality against volatile acidity with a fitted straight line overlaid.
ggplot(data = redWine, mapping = aes(x = volatile.acidity, y = quality)) +
  geom_point() +
  geom_smooth(method = "lm")
Here is a visualization of the model. Granted, ‘quality’ is an ordinal rating rather than a truly continuous numerical variable, for which a regression would be more appropriate; however, I think it is still a useful way to understand the data.
# Horizontal boxplots of citric acid, one box per quality rating.
ggplot(
  data = redWine,
  mapping = aes(x = factor(quality), y = citric.acid, fill = factor(quality))
) +
  geom_boxplot() +
  coord_flip()
Citric acid can add freshness to wine taste. There appears to be a slight positive relationship between citric acid and quality. However, the max/min citric acids for every quality rating run the full spectrum of the data set. Citric acid is likely not a huge factor in quality ratings.
# Simple linear regression of quality on alcohol content.
# Bare column names plus `data =` (instead of redWine$... inside the formula)
# give clean coefficient names and let predict() work with new data.
lmQualAl <- lm(quality ~ alcohol, data = redWine)
summary(lmQualAl)
##
## Call:
## lm(formula = quality ~ alcohol, data = redWine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8442 -0.4112 -0.1690 0.5166 2.5888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87497 0.17471 10.73 <2e-16 ***
## alcohol 0.36084 0.01668 21.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7104 on 1597 degrees of freedom
## Multiple R-squared: 0.2267, Adjusted R-squared: 0.2263
## F-statistic: 468.3 on 1 and 1597 DF, p-value: < 2.2e-16
This is a linear model with alcohol as the independent variable and quality as the dependent variable. R^2 is 0.227, which is somewhat higher than the 0.15 of the volatile acidity model. Nonetheless, a linear model is not likely the best fit for the data set. But, as mentioned before, I think the visualization is helpful.
# Quality against alcohol content with a fitted straight line overlaid.
ggplot(data = redWine, mapping = aes(x = alcohol, y = quality)) +
  geom_point() +
  geom_smooth(method = "lm")
Visual of linear model.
# Horizontal boxplots of sulphates, one box per quality rating.
ggplot(
  data = redWine,
  mapping = aes(x = factor(quality), y = sulphates, fill = factor(quality))
) +
  geom_boxplot() +
  coord_flip()
Sulphates appear to have a small positive effect on quality. The median is very similar across quality ratings. However, it is interesting to note that the variability is a lot higher for wine quality 5-7 and a lot less variable for wines 3 and 8.
# Scatter plot of citric acid against pH.
ggplot(data = redWine, mapping = aes(x = pH, y = citric.acid)) +
  geom_point()
pH and citric acid appear negatively correlated. This makes sense as pH is less acidic as it goes up. I would be interested in adding ‘quality’ as a third variable to this graph later.
# Scatter plot of sulphates against alcohol content.
ggplot(data = redWine, mapping = aes(x = alcohol, y = sulphates)) +
  geom_point()
I created this graph to see how the two biggest positive factors in wine quality ratings matched up. There seems to be little relationship. However, lower alcohol percentages have somewhat more variability in sulphate content.
# Scatter plot of citric acid against alcohol; semi-transparent points make
# densely populated regions appear darker.
ggplot(data = redWine, mapping = aes(x = alcohol, y = citric.acid)) +
  geom_point(alpha = 1 / 5)
Alcohol is positively correlated with taste and so is citric acid. It is clear that low alcohol wines tend to have less citric acid in the graph (blacker dots indicate more concentration of data).
Alcohol appears to have a significant effect on the quality ratings of red wine. Many of the highest rated wines had a higher alcohol percentage and many of the lowest had a lower alcohol percentage. Not one of the lowest rated wines had an ABV higher than 12. Creating a linear model, however, shows R^2 to be low. Other factors must be used to adequately explain red wine quality ratings.
Volatile acidity also is a factor in the quality ratings of red wine. Volatile acidity is negatively correlated with quality ratings. Too much of it gives wine a vinegar like taste.
Higher citric acid and sulphates also positively contribute to the quality of the red wine.
Lower alcohol wines tend to have a smaller citric acid content. This is interesting because both positively contribute to a higher quality rating. It appears both work in tandem to contribute to the quality, good or bad, of wine.
Sulphates are also positively related to wine ratings. Though the effect is not large, it is interesting to note that many low alcohol wines also have higher sulphate content, despite the fact that low alcohol wines tend to be rated lower.
pH and citric acid tend towards a negative correlation which is no surprise since by definition a higher pH means a lower acidity level. However the variation in pH is only between 3-4 in the data set. It is interesting to see how much the citric acid level fluctuates between just one step in the pH scale.
The strongest relationship with regards to wine taste was, positively, alcohol content (0.47) and negatively, volatile acidity (-0.39).
# Citric acid against pH, with points coloured by quality rating.
ggplot(
  data = redWine,
  mapping = aes(x = pH, y = citric.acid, colour = factor(quality))
) +
  geom_point()
Citric acid and pH with quality colored in. It is difficult to say anything about this graph as the dots overlap too much. Let’s look at just ratings 3 and 8.
# Citric acid against pH for only the lowest (3) and highest (8) rated wines.
ggplot(
  data = subset(redWine, quality %in% c(3, 8)),
  mapping = aes(x = pH, y = citric.acid, colour = factor(quality))
) +
  geom_point()
Wines rated 8 tend to have high citric acid and lower pH while wines rated 3 have high pH and low citric acid.
# Citric acid against alcohol, with points coloured by quality rating.
ggplot(
  data = redWine,
  mapping = aes(x = alcohol, y = citric.acid, colour = factor(quality))
) +
  geom_point()
Here one can see that wines rated 6 and above all tend to have higher alcohol percentage. Citric acid does not appear to be as big a factor for wines rated 6,7,or 8.
# Citric acid against alcohol for only the lowest (3) and highest (8) rated wines.
ggplot(
  data = subset(redWine, quality %in% c(3, 8)),
  mapping = aes(x = alcohol, y = citric.acid, colour = factor(quality))
) +
  geom_point()
Subset of highest and lowest rated wines. Low citric acid and lower alcohol content have a tendency for a lower rating.
# Citric acid against alcohol for the two most common ratings, 5 and 6.
ggplot(
  data = subset(redWine, quality %in% c(5, 6)),
  mapping = aes(x = alcohol, y = citric.acid, colour = factor(quality))
) +
  geom_point()
Subset of the most common ratings, 5 and 6. Interestingly, there appears to be a huge difference between 5 and 6 regarding alcohol content. Very few wines that are rated 5 have alcohol content above 12. Citric acid does not appear to play as big a role between ratings 5 and 6.
# Quality as a factor so that each rating gets its own density curve and
# legend entry; `ratings` is also used by later plots.
ratings <- factor(redWine$quality)
# Overlapping alcohol density curves, one per quality rating.
ggplot(redWine, aes(x = alcohol, colour = ratings, fill = ratings)) +
  geom_density(alpha = 0.1)
Overlapping graph of wines by alcohol and quality. Surprisingly, wines that were rated a 5 were likely to be around 9% while wines rated a 3 were likely to be around 10%. Wines with a rating of 5 have less variation in alcohol content than wines rated a 3. Also, wines with higher alcohol percentages, as mentioned before, tend to have higher ratings. Wines rated a 7 or 8 tend to have more alcohol content variability.
# Volatile acidity against alcohol, coloured by the `ratings` factor defined earlier.
ggplot(
  data = redWine,
  mapping = aes(x = alcohol, y = volatile.acidity, colour = ratings)
) +
  geom_point()
# Same view restricted to the lowest (3) and highest (8) rated wines.
ggplot(
  data = subset(redWine, quality %in% c(3, 8)),
  mapping = aes(x = alcohol, y = volatile.acidity, colour = factor(quality))
) +
  geom_point()
The best wines have higher alcohol and low volatile acidity, while the worst have lower alcohol and higher volatile acidity.
pH and citric acid were found to have a relationship with quality. When looking at the highest and lowest rated wines, those with high pH and low citric acid tend to be rated the lowest, while those with low pH and high citric acid tend to receive the highest rating in the data set. However, since acidity and pH are inversely related, it is no surprise that this pattern was found in the data set.