# Load all of the packages that you end up using
# in your analysis in this code chunk.
library(ggplot2)
library(lsr)
library(GGally)
library(RColorBrewer)
library(grid)
library(gridExtra)
library(ellipse)
library(MASS)
library(ellipse)
library(lattice)
library(memisc)
##
## Attaching package: 'memisc'
## The following objects are masked from 'package:stats':
##
## contrasts, contr.sum, contr.treatment
## The following object is masked from 'package:base':
##
## as.array
library(reshape2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:memisc':
##
## percent
# Build a histogram of a single column.
#
# Args:
#   x_str: name of the column to plot, given as a string.
#   t:     plot title.
#   xl:    x-axis label.
#   bins:  number of histogram bins. 30 is the value stat_bin() falls back
#          to when nothing is given, so the default output is unchanged.
#   d:     data frame to plot; defaults to the global `rw`.
#
# Returns a ggplot object, so callers can keep adding layers with `+`.
plot_hist_one_var <- function(x_str = "fixed.acidity", t, xl,
                              bins = 30, d = rw) {
  ggplot(data = d, aes_string(x = x_str)) +
    # Passing `bins` explicitly avoids the "Pick better value with
    # `binwidth`" message that is otherwise printed for every plot.
    geom_histogram(bins = bins, color = "white", fill = "#3366FF") +
    ggtitle(t) +
    xlab(xl) +
    theme(plot.title = element_text(size = 12))
}
# Build a box plot of one column grouped by another.
#
# Args:
#   x_str: name of the grouping column (string); defaults to "quality.rank".
#   y_str: name of the column whose distribution is drawn (string).
#   xl:    x-axis label.
#   yl:    y-axis label.
#   t:     plot title.
#   d:     data frame to plot; defaults to the global `rw`, matching the
#          `d` argument of density_plot_func.
#
# Returns a ggplot object, so callers can keep adding layers with `+`.
box_plot_func <- function(x_str = "quality.rank",
                          y_str, xl = "Quality rank", yl, t,
                          d = rw) {
  ggplot(data = d, aes_string(x = x_str, y = y_str)) +
    geom_boxplot() +
    xlab(xl) +
    ylab(yl) +
    ggtitle(t)
}
# Build a jittered scatterplot coloured by a categorical column.
#
# Args:
#   col:   name of the column mapped to colour (string).
#   x_str: name of the x column (string).
#   y_str: name of the y column (string).
#   xl:    x-axis label.
#   yl:    y-axis label.
#   t:     plot title.
#   rvrs:  passed to guide_legend(reverse = ) to flip the legend order.
#   lt:    legend title.
#   d:     data frame to plot; defaults to the global `rw`, matching the
#          `d` argument of density_plot_func.
#
# Returns a ggplot object, so callers can keep adding layers with `+`.
point_plot_func <- function(col = "quality.rank",
                            x_str, y_str, xl, yl, t, rvrs = TRUE,
                            lt = "Quality rank",
                            d = rw) {
  ggplot(data = d, aes_string(x = x_str, y = y_str, color = col)) +
    geom_point(size = 3, alpha = 0.5, position = "jitter") +
    xlab(xl) +
    ylab(yl) +
    ggtitle(t) +
    scale_color_brewer(palette = "Spectral",
                       guide = guide_legend(title = lt,
                                            reverse = rvrs))
}
# Density curves of one column, one curve per level of a colour column.
#   d     - data frame to plot (defaults to the global `rw`)
#   x_str - name of the column whose density is drawn (string)
#   col   - name of the column mapped to line colour (string)
#   xl    - x-axis label
#   t     - plot title
# Returns a ggplot object.
density_plot_func <- function(d = rw, x_str, col, xl, t) {
  base_plot <- ggplot(data = d, aes_string(x = x_str, color = col))
  density_layer <- geom_density(size = 1)
  base_plot + density_layer + xlab(xl) + ggtitle(t)
}
The main purpose of this data set analysis is to figure out which chemical characteristics influence red wine quality.
Explore the following parameters:
Output variable (based on sensory data): * quality (score between 0 and 10)
rw <- read.csv('/home/alex/ml/data/wineQualityReds.csv')
names(rw)
## [1] "X" "fixed.acidity" "volatile.acidity"
## [4] "citric.acid" "residual.sugar" "chlorides"
## [7] "free.sulfur.dioxide" "total.sulfur.dioxide" "density"
## [10] "pH" "sulphates" "alcohol"
## [13] "quality"
summary(rw)
## X fixed.acidity volatile.acidity citric.acid
## Min. : 1.0 Min. : 4.60 Min. :0.1200 Min. :0.000
## 1st Qu.: 400.5 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
## Median : 800.0 Median : 7.90 Median :0.5200 Median :0.260
## Mean : 800.0 Mean : 8.32 Mean :0.5278 Mean :0.271
## 3rd Qu.:1199.5 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
## Max. :1599.0 Max. :15.90 Max. :1.5800 Max. :1.000
## residual.sugar chlorides free.sulfur.dioxide
## Min. : 0.900 Min. :0.01200 Min. : 1.00
## 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00
## Median : 2.200 Median :0.07900 Median :14.00
## Mean : 2.539 Mean :0.08747 Mean :15.87
## 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00
## Max. :15.500 Max. :0.61100 Max. :72.00
## total.sulfur.dioxide density pH sulphates
## Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300
## 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500
## Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200
## Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581
## 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300
## Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000
## alcohol quality
## Min. : 8.40 Min. :3.000
## 1st Qu.: 9.50 1st Qu.:5.000
## Median :10.20 Median :6.000
## Mean :10.42 Mean :5.636
## 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :14.90 Max. :8.000
class(rw$quality)
## [1] "integer"
I’d like to group the quality variable into larger bins: [3, 4], (4, 6] and (6, 10]. This makes the analysis deeper and more specific. Let’s create a new categorical variable called rw$quality.rank with values “low”, “middle” and “high”.
# Bin the integer quality score into three ranks:
#   low = [3, 4], middle = (4, 6], high = (6, 10].
# include.lowest = TRUE closes the first interval on the left, so a score
# of 3 falls into "low" instead of becoming NA.
rw$quality.rank <- cut(rw$quality,
                       breaks = c(3, 4, 6, 10),
                       labels = c("low", "middle", "high"),
                       include.lowest = TRUE)
summary(rw$quality.rank)
## low middle high
## 63 1319 217
# Bar chart of the raw quality scores. `quality` is left as an integer here
# (no conversion is applied); geom_bar() counts the rows for each score.
ggplot(rw, aes(x = quality)) +
  geom_bar(fill = "#3366FF", color = "white") +
  xlab("Quality") +
  ggtitle("Quality distribution") +
  theme(plot.title = element_text(size = 12))
# Univariate histograms. Each plot is zoomed with coord_cartesian(), which
# only changes the visible window and does not drop rows before binning.

# Fixed acidity, limited to the 1st-99th percentile range.
plot_fix_acid <- plot_hist_one_var(x_str = "fixed.acidity",
                                   t = "Fixed acidity distribution",
                                   xl = "Fixed acidity (g / dm^3)") +
  coord_cartesian(xlim = c(quantile(rw$fixed.acidity, 0.01),
                           quantile(rw$fixed.acidity, 0.99)))

plot_vol_acid <- plot_hist_one_var(x_str = "volatile.acidity",
                                   t = "Volatile acidity distribution",
                                   xl = "Volatile acidity (g / dm^3)") +
  coord_cartesian(xlim = c(0, 1))

plot_cit_acid <- plot_hist_one_var(x_str = "citric.acid",
                                   t = "Citric acidity distribution",
                                   xl = "Citric acidity (g / dm^3)") +
  coord_cartesian(xlim = c(0, 0.75))

plot_res_sugar <- plot_hist_one_var(x_str = "residual.sugar",
                                    t = "Residual sugar distribution",
                                    xl = "Residual sugar (g / dm^3)") +
  scale_x_continuous(breaks = seq(1, 5, 0.5)) +
  coord_cartesian(xlim = c(1, 5))

plot_chlorides <- plot_hist_one_var(x_str = "chlorides",
                                    t = "Chlorides distribution",
                                    xl = "Chlorides (g / dm^3)") +
  coord_cartesian(xlim = c(0, 0.2))

plot_so2 <- plot_hist_one_var(x_str = "total.sulfur.dioxide",
                              t = "Total sulfur dioxide distribution",
                              xl = "Total sulfur dioxide (mg / dm^3)") +
  coord_cartesian(xlim = c(0, 150))

plot_ph <- plot_hist_one_var(x_str = "pH",
                             t = "pH distribution",
                             xl = "pH") +
  scale_x_continuous(breaks = seq(0, 4, 0.2)) +
  coord_cartesian(xlim = c(2.8, 3.8))

# Alcohol is shown over its full range (no zoom).
plot_alcohol <- plot_hist_one_var(x_str = "alcohol",
                                  t = "Alcohol distribution",
                                  xl = "Alcohol (% by volume)")

# First page of the grid: the acidity and sugar variables.
grid.arrange(plot_fix_acid, plot_vol_acid, plot_cit_acid, plot_res_sugar,
             ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
grid.arrange(plot_chlorides,plot_so2,plot_ph,plot_alcohol,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
correlate(rw)
##
## CORRELATIONS
## ============
## - correlation type: pearson
## - correlations shown only when both variables are numeric
##
## X fixed.acidity volatile.acidity citric.acid
## X . -0.268 -0.009 -0.154
## fixed.acidity -0.268 . -0.256 0.672
## volatile.acidity -0.009 -0.256 . -0.552
## citric.acid -0.154 0.672 -0.552 .
## residual.sugar -0.031 0.115 0.002 0.144
## chlorides -0.120 0.094 0.061 0.204
## free.sulfur.dioxide 0.090 -0.154 -0.011 -0.061
## total.sulfur.dioxide -0.118 -0.113 0.076 0.036
## density -0.368 0.668 0.022 0.365
## pH 0.136 -0.683 0.235 -0.542
## sulphates -0.125 0.183 -0.261 0.313
## alcohol 0.245 -0.062 -0.202 0.110
## quality 0.066 0.124 -0.391 0.226
## quality.rank . . . .
## residual.sugar chlorides free.sulfur.dioxide
## X -0.031 -0.120 0.090
## fixed.acidity 0.115 0.094 -0.154
## volatile.acidity 0.002 0.061 -0.011
## citric.acid 0.144 0.204 -0.061
## residual.sugar . 0.056 0.187
## chlorides 0.056 . 0.006
## free.sulfur.dioxide 0.187 0.006 .
## total.sulfur.dioxide 0.203 0.047 0.668
## density 0.355 0.201 -0.022
## pH -0.086 -0.265 0.070
## sulphates 0.006 0.371 0.052
## alcohol 0.042 -0.221 -0.069
## quality 0.014 -0.129 -0.051
## quality.rank . . .
## total.sulfur.dioxide density pH sulphates alcohol
## X -0.118 -0.368 0.136 -0.125 0.245
## fixed.acidity -0.113 0.668 -0.683 0.183 -0.062
## volatile.acidity 0.076 0.022 0.235 -0.261 -0.202
## citric.acid 0.036 0.365 -0.542 0.313 0.110
## residual.sugar 0.203 0.355 -0.086 0.006 0.042
## chlorides 0.047 0.201 -0.265 0.371 -0.221
## free.sulfur.dioxide 0.668 -0.022 0.070 0.052 -0.069
## total.sulfur.dioxide . 0.071 -0.066 0.043 -0.206
## density 0.071 . -0.342 0.149 -0.496
## pH -0.066 -0.342 . -0.197 0.206
## sulphates 0.043 0.149 -0.197 . 0.094
## alcohol -0.206 -0.496 0.206 0.094 .
## quality -0.185 -0.175 -0.058 0.251 0.476
## quality.rank . . . . .
## quality quality.rank
## X 0.066 .
## fixed.acidity 0.124 .
## volatile.acidity -0.391 .
## citric.acid 0.226 .
## residual.sugar 0.014 .
## chlorides -0.129 .
## free.sulfur.dioxide -0.051 .
## total.sulfur.dioxide -0.185 .
## density -0.175 .
## pH -0.058 .
## sulphates 0.251 .
## alcohol 0.476 .
## quality . .
## quality.rank . .
# Create a data frame without the two quality columns. Despite its name,
# `numeric_vars` flags the columns to DROP; it is negated when subsetting.
numeric_vars <- is.element(names(rw), c("quality.rank", "quality"))
rw_num <- rw[, !numeric_vars]
ctab <- cor(rw_num)
# Show the correlation matrix as ellipses. Correlations in [-1, 1] are
# rescaled to [0, 1] and mapped onto a red - white - blue colour ramp.
colorfun <- colorRamp(c("#CC0000", "white", "#3366CC"), space = "Lab")
plotcorr(
  ctab,
  mar = c(0, 0, 0, 0),
  col = rgb(colorfun((ctab + 1) / 2), maxColorValue = 255)
)
#set.seed(278812)
#rw_samp <- rw[sample(1:length(rw$quality), 1000), ]
#ggpairs(rw_samp, axisLabels = "internal",
# params = c(shape = I('.'), outlier.shape = I('.')),
# diag = list(prams = c(size = 1)))
# Citric acid spread within each quality rank
box_plot_func(
  y_str = "citric.acid",
  yl = "Citric acid",
  t = "Citric acid by quality rank"
)
Higher-quality wines have a higher level of citric acid. At the same time, the maximum value is no bigger than 1.0 and the 3rd quartile equals 0.42. Outliers are almost absent.
# Total SO2 spread within each quality rank
box_plot_func(
  y_str = "total.sulfur.dioxide",
  yl = "Total SO2",
  t = "Total SO2 by quality rank"
)
No conclusions from Total SO2 distribution. Maybe, the reason is the values lie under the obvious limits in most of observations.
# Residual sugar spread within each quality rank (full range)
box_plot_func(
  y_str = "residual.sugar",
  yl = "Residual sugar",
  t = "Residual sugar by quality rank"
)
Set limits and zoom the plot
# Same box plot, zoomed to 1.5-3 so the boxes are readable
box_plot_func(
  y_str = "residual.sugar",
  yl = "Residual sugar",
  t = "Residual sugar by quality rank"
) +
  coord_cartesian(ylim = c(1.5, 3))
by(rw$residual.sugar, rw$quality.rank, summary)
## rw$quality.rank: low
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.200 1.900 2.100 2.685 2.950 12.900
## --------------------------------------------------------
## rw$quality.rank: middle
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.504 2.600 15.500
## --------------------------------------------------------
## rw$quality.rank: high
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.200 2.000 2.300 2.709 2.700 8.900
The residual sugar median is higher for higher-quality wines. At the same time, it has a huge dispersion, especially for poor wines: their 1st Qu. = 1.9, which is no higher than for the “middle” and “high” ranks, while their 3rd Qu. = 2.95, which is bigger than for the other wines. The maximum sugar value is 15.5, so there are no sweet wines in this data set (a wine must contain more than 45 g/dm^3 of sugar to be considered sweet).
# pH by quality rank, zoomed to 3.1-3.6
box_plot_func(
  y_str = "pH",
  yl = "pH",
  t = "pH by quality rank"
) +
  coord_cartesian(ylim = c(3.1, 3.6))
by(rw$pH, rw$quality.rank, summary)
## rw$quality.rank: low
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.740 3.300 3.380 3.384 3.500 3.900
## --------------------------------------------------------
## rw$quality.rank: middle
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.860 3.210 3.310 3.311 3.400 4.010
## --------------------------------------------------------
## rw$quality.rank: high
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.880 3.200 3.270 3.289 3.380 3.780
pH has higher values for poor wines.
# Alcohol by quality rank, zoomed to 9-13
box_plot_func(
  y_str = "alcohol",
  yl = "Alcohol",
  t = "Alcohol by quality rank"
) +
  coord_cartesian(ylim = c(9, 13))
by(rw$alcohol, rw$quality.rank, summary)
## rw$quality.rank: low
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.40 9.60 10.00 10.22 11.00 13.10
## --------------------------------------------------------
## rw$quality.rank: middle
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.40 9.50 10.00 10.25 10.90 14.90
## --------------------------------------------------------
## rw$quality.rank: high
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.20 10.80 11.60 11.52 12.20 14.00
There is a significant difference in alcohol level between “high” quality wines and the others. At the same time, wines with “low” and “middle” quality ranks have pretty similar distributions and medians.
# Sulphates spread within each quality rank (full range)
box_plot_func(
  y_str = "sulphates",
  yl = "Sulphates",
  t = "Sulphates by quality rank"
)
Set limits and zoom the plot
# Same box plot, zoomed to 0.4-0.9
box_plot_func(
  y_str = "sulphates",
  yl = "Sulphates",
  t = "Sulphates by quality rank"
) +
  coord_cartesian(ylim = c(0.4, 0.9))
by(rw$sulphates, rw$quality.rank, summary)
## rw$quality.rank: low
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3300 0.4950 0.5600 0.5922 0.6000 2.0000
## --------------------------------------------------------
## rw$quality.rank: middle
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3700 0.5400 0.6100 0.6473 0.7000 1.9800
## --------------------------------------------------------
## rw$quality.rank: high
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3900 0.6500 0.7400 0.7435 0.8200 1.3600
This result is a little unexpected. Quality rank depends on the sulphates value. I’d assumed sulphates make wine worse, but that isn’t confirmed here. I’ll examine this variable later.
# Chlorides by quality rank, zoomed to 0.05-0.1
box_plot_func(
  y_str = "chlorides",
  yl = "Chlorides",
  t = "Chlorides by quality rank"
) +
  coord_cartesian(ylim = c(0.05, 0.1))
Chlorides have many outliers.
Seems that wines quality depend on the following characteristics:
Let’s examine these variables deeper.
# Chlorides against pH, coloured by quality rank; y is zoomed to 0-0.2.
# The legend order is left un-reversed for this plot (rvrs = FALSE).
point_plot_func(x_str = "pH",
                y_str = "chlorides",
                xl = "pH",
                yl = "Chlorides",
                t = "Chlorides vs pH by Quality rank",
                rvrs = FALSE) +
  coord_cartesian(ylim = c(0, 0.2))
No conclusions from this plot
# Sulphates against pH, coloured by quality rank
point_plot_func(
  x_str = "pH",
  y_str = "sulphates",
  xl = "pH",
  yl = "Sulphates",
  t = "Sulphates vs pH by Quality rank"
)
It looks like, at an equal level of pH, wines with a higher sulphates value are of higher quality. This is one more piece of evidence of the influence of sulphates on quality.
# Citric acid against pH, coloured by quality rank
point_plot_func(
  x_str = "pH",
  y_str = "citric.acid",
  xl = "pH",
  yl = "Citric acid",
  t = "Citric acid vs pH by Quality rank"
)
Most of HQ wines lie above 0.25 of Citric acid value
# Volatile acidity against pH, coloured by quality rank (legend not reversed)
point_plot_func(
  x_str = "pH",
  y_str = "volatile.acidity",
  xl = "pH",
  yl = "Volatile acidity",
  t = "Volatile acidity vs pH by Quality rank",
  rvrs = FALSE
)
Looks like wines with lower volatile acidity value will be of higher quality with the equal level of pH.
# Sulphates against total SO2, zoomed to x in 0-100 and y in 0-1.5
point_plot_func(
  x_str = "total.sulfur.dioxide",
  y_str = "sulphates",
  xl = "Total SO2",
  yl = "Sulphates",
  t = "Sulphates vs Total SO2 by Quality rank"
) +
  coord_cartesian(xlim = c(0, 100), ylim = c(0, 1.5))
Another confirmation of the influence of sulphates. It seems that quality wines mostly contain no more than 50 mg/dm^3 of total SO2.
# Sulphates against volatile acidity, coloured by quality rank
point_plot_func(
  x_str = "volatile.acidity",
  y_str = "sulphates",
  xl = "Volatile acidity",
  yl = "Sulphates",
  t = "Sulphates vs Volatile acidity by Quality rank"
)
Quality wines lie under 0.4 volatile acidity. At the same time, high quality wines have a bigger dispersion of sulphates than the other wines.
# Sulphates against alcohol, coloured by quality rank
point_plot_func(
  x_str = "alcohol",
  y_str = "sulphates",
  xl = "Alcohol",
  yl = "Sulphates",
  t = "Sulphates vs Alcohol by Quality rank"
)
Another interesting plot. All quality wines are placed in the top right corner, with higher levels of sulphates and alcohol.
# Density against alcohol, coloured by quality rank
point_plot_func(
  x_str = "alcohol",
  y_str = "density",
  xl = "Alcohol",
  yl = "density",
  t = "Density vs alcohol by Quality rank"
)
It looks like density decreases as the alcohol level grows.
# Density against residual sugar, coloured by quality rank; x zoomed to 1-4.
# The title now reads "... by Quality rank", like the sibling plots.
point_plot_func(x_str = "residual.sugar",
                y_str = "density",
                xl = "Residual sugar",
                yl = "Density",
                t = "Density vs Residual sugar by Quality rank") +
  coord_cartesian(xlim = c(1, 4))
No conclusions from this plot
# Residual sugar against alcohol with the continuous `density` column mapped
# to colour; y is zoomed to 1-4.
ggplot(rw, aes(x = alcohol, y = residual.sugar, color = density)) +
  geom_point(position = "jitter", alpha = 0.5, size = 3) +
  ggtitle("Residual sugar vs Alcohol by Density") +
  xlab("Alcohol") +
  ylab("Residual sugar") +
  coord_cartesian(ylim = c(1, 4))
No conclusions from this plot
# Residual sugar against alcohol, coloured by quality rank; y zoomed to 1-4
point_plot_func(
  x_str = "alcohol",
  y_str = "residual.sugar",
  xl = "Alcohol",
  yl = "Residual sugar",
  t = "Residual sugar vs Alcohol by Quality rank"
) +
  coord_cartesian(ylim = c(1, 4))
No conclusions from this plot
# Citric acid against alcohol, coloured by quality rank.
# NOTE(review): ylim 0.05-0.15 shows only a thin slice of citric.acid, which
# spans 0-1.0 in summary(rw) - confirm this window is intended.
point_plot_func(
  x_str = "alcohol",
  y_str = "citric.acid",
  xl = "Alcohol",
  yl = "Citric acid",
  t = "Citric acid vs Alcohol by Quality rank"
) +
  coord_cartesian(ylim = c(0.05, 0.15))
No conclusions from this plot
# Volatile acidity against alcohol, coloured by quality rank
point_plot_func(
  x_str = "alcohol",
  y_str = "volatile.acidity",
  xl = "Alcohol",
  yl = "Volatile acidity",
  t = "Volatile acidity vs alcohol by Quality rank"
)
This confirms the thesis that wines with a lower volatile acidity level are of higher quality.
# Create a subset with only the "high" quality rank wines
hqrw <- subset(rw, quality.rank == "high")

# Citric acid against volatile acidity for that subset. The translucent red
# rectangle marks the region with volatile acidity above 0.4 and citric acid
# below 0.25.
ggplot() +
  geom_point(data = hqrw,
             aes(x = volatile.acidity, y = citric.acid),
             color = "#3366FF",
             size = 3) +
  ylab("Citric acid") +
  xlab("Volatile acidity") +
  ggtitle("Citric acid vs Volatile acidity for high quality rank wines") +
  scale_x_continuous(breaks = seq(0, 1.6, 0.2)) +
  scale_y_continuous(breaks = seq(0, 0.75, 0.25)) +
  geom_rect(aes(xmin = 0.4, xmax = Inf, ymin = 0, ymax = 0.25),
            alpha = 0.2, fill = "red")
This plot shows that most HQ wines lie in the zone with a high level of citric acid (>0.25) and a low level of volatile acidity (<0.4). However, there are some HQ wines which break this rule. They are placed in the red rectangle on the plot.
I’d like to compare these two groups and try to figure out which parameters differ and which characteristics help preserve high wine quality even with high volatile acidity and low citric acid. Let’s create a new categorical variable in the new data set that defines whether a wine lies in the “red zone” or not. Let’s call it hqrw$is.outliers.
# Flag HQ wines with volatile acidity above 0.4 and citric acid below 0.25:
# "y" when both conditions hold, "n" otherwise.
hqrw$is.outliers <- ifelse(
  hqrw$volatile.acidity > 0.4 & hqrw$citric.acid < 0.25,
  yes = "y",
  no = "n"
)
Let’s look at our variables
# pH density for HQ wines, one curve per is.outliers value
density_plot_func(
  d = hqrw,
  x_str = "pH",
  col = "is.outliers",
  xl = "pH",
  t = "Density of pH for high quality rank wines"
)
by(hqrw$pH, hqrw$is.outliers, summary)
## hqrw$is.outliers: n
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.880 3.170 3.240 3.241 3.320 3.780
## --------------------------------------------------------
## hqrw$is.outliers: y
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.320 3.370 3.460 3.476 3.560 3.720
The pH level differs significantly between these two groups. The 1st Qu. of the “red zone” values equals 3.37, which is higher than the 3rd Qu. (3.32) of the rest of the data set. Earlier I noticed that worse wines have a higher pH value. In this case HQ wines have a higher level of pH, so it’s a very interesting finding.
Let’s look on the other variables.
# Density plots for the HQ subset, split by is.outliers. Each plot gets a
# smaller title font so it fits in the 2 x 2 grid.
HQ_sulph_density <- density_plot_func(
  d = hqrw,
  x_str = "sulphates",
  col = "is.outliers",
  xl = "Sulphates",
  t = "Sulphates density by HQ wine group"
) +
  theme(plot.title = element_text(size = 10))

HQ_totalso_density <- density_plot_func(
  d = hqrw,
  x_str = "total.sulfur.dioxide",
  col = "is.outliers",
  xl = "Total SO2",
  t = "Total SO2 density by HQ wine group"
) +
  theme(plot.title = element_text(size = 10))

HQ_freeso_density <- density_plot_func(
  d = hqrw,
  x_str = "free.sulfur.dioxide",
  col = "is.outliers",
  xl = "Free SO2",
  t = "Free SO2 density by HQ wine group"
) +
  theme(plot.title = element_text(size = 10))

HQ_chlorides_density <- density_plot_func(
  d = hqrw,
  x_str = "chlorides",
  col = "is.outliers",
  xl = "Chlorides",
  t = "Chlorides density by HQ wine group"
) +
  theme(plot.title = element_text(size = 10))

# Arrange the four plots in two columns
grid.arrange(
  HQ_chlorides_density,
  HQ_freeso_density,
  HQ_totalso_density,
  HQ_sulph_density,
  ncol = 2
)
by(hqrw$total.sulfur.dioxide, hqrw$is.outliers, summary)
## hqrw$is.outliers: n
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 16.00 25.00 33.31 42.00 289.00
## --------------------------------------------------------
## hqrw$is.outliers: y
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 18.00 33.00 41.11 49.25 106.00
by(hqrw$free.sulfur.dioxide, hqrw$is.outliers, summary)
## hqrw$is.outliers: n
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 6.00 10.00 13.73 18.00 54.00
## --------------------------------------------------------
## hqrw$is.outliers: y
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 10.00 13.50 14.98 17.25 37.00
by(hqrw$chlorides, hqrw$is.outliers, summary)
## hqrw$is.outliers: n
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01200 0.06300 0.07400 0.07714 0.08500 0.35800
## --------------------------------------------------------
## hqrw$is.outliers: y
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04200 0.05400 0.06500 0.07109 0.08700 0.14300
# Final plot one: alcohol density by quality rank next to volatile acidity
# vs alcohol. Both legends are anchored inside the top-right corner.
final_plot_one_1 <- density_plot_func(
  x_str = "alcohol",
  col = "quality.rank",
  xl = "Alcohol",
  t = "Alcohol density\n by quality rank"
) +
  theme(legend.position = c(1, 1),
        legend.justification = c(1, 1))

final_plot_one_2 <- point_plot_func(
  x_str = "alcohol",
  y_str = "volatile.acidity",
  xl = "Alcohol",
  yl = "Volatile acidity",
  t = "Volatile acidity vs alcohol\n by quality rank",
  rvrs = FALSE
) +
  theme(legend.position = c(1, 1),
        legend.justification = c(1, 1))

grid.arrange(final_plot_one_1, final_plot_one_2, ncol = 2)
HQ wines contain more alcohol than the others. However, we shouldn’t forget about acidity. Among wines with more than 10% alcohol, HQ wines are placed in the zone with low volatile acidity.
# Final plot two: sulphates box plot by quality rank next to sulphates vs pH.
# Both legends are anchored inside the bottom-right corner.
final_plot_two_1 <- ggplot(rw, aes(x = quality.rank,
                                   y = sulphates,
                                   fill = quality.rank)) +
  geom_boxplot() +
  ggtitle("Sulphates by quality rank") +
  xlab("Quality rank") +
  ylab("Sulphates (g / dm^3)") +
  coord_cartesian(ylim = c(0.4, 0.9)) +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))

final_plot_two_2 <- point_plot_func(
  x_str = "pH",
  y_str = "sulphates",
  xl = "pH",
  yl = "Sulphates",
  t = "Sulphates vs pH by quality rank"
) +
  coord_cartesian(ylim = c(0.2, 1.2)) +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))

grid.arrange(final_plot_two_1, final_plot_two_2, ncol = 2)
Quality rank depends on the sulphates value, and this is a slightly unexpected result. If we look at sulphates vs pH, we find that HQ wines contain more sulphates at the same level of pH.
# Final plot three, left panel: citric acid against volatile acidity for all
# wines, coloured by quality rank. The translucent red rectangle marks the
# region with volatile acidity above 0.4 and citric acid below 0.25.
final_plot_tree_1 <- ggplot() +
  geom_point(data = rw,
             aes(x = volatile.acidity,
                 y = citric.acid,
                 color = quality.rank),
             alpha = 0.5,
             size = 2) +
  scale_x_continuous(breaks = seq(0, 1.6, 0.2)) +
  ylab("Citric acid") +
  xlab("Volatile acidity") +
  ggtitle("Citric acid vs Volatile acidity by quality rank") +
  scale_color_brewer(palette = "Spectral",
                     type = "div",
                     guide = guide_legend(title = "Quality rank",
                                          reverse = TRUE)) +
  geom_rect(aes(xmin = 0.4, xmax = Inf, ymin = 0, ymax = 0.25),
            alpha = 0.1, fill = "red") +
  theme(plot.title = element_text(size = 12),
        legend.justification = c(1, 1),
        legend.position = c(1, 1))

# Right panel: pH density of the HQ subset, split by is.outliers.
final_plot_tree_2 <- density_plot_func(d = hqrw,
                                       col = "is.outliers",
                                       x_str = "pH",
                                       xl = "pH",
                                       t = "Density of pH for HQ rank wines") +
  theme(plot.title = element_text(size = 12),
        legend.justification = c(1, 1),
        legend.position = c(1, 1))

grid.arrange(final_plot_tree_1, final_plot_tree_2, ncol = 2)
HQ wines mainly have a low level of volatile acidity and a citric acid level between 0.25 and 0.75. However, some samples of HQ wines have different parameters.
Additional research on the HQ wines subset and its outliers shows interesting dependencies. When wines keep a high level of quality even in the “red zone”, their pH value is significantly higher. To reduce the influence of volatile acidity on quality, the quantity of antioxidants needs to be increased. As a result, we get a higher pH value for HQ wines with a high level of acidity. In this data set, antioxidants may be represented by such parameters as Chlorides, Total and Free SO2 and Sulphates.
The three final plots demonstrate the dependence of wine quality on several characteristics. As a result, wines of the same quality level may have different proportions of SO2, sulphates, pH, acidity and alcohol.
Main findings and trends of red wine data set:
Another interesting thing is that poor wines have a high pH level, but in the third plot we saw the opposite evidence. I think the pH level provides the balance needed to give wine a good taste.
For example, we have pretty similar pH values for different wine qualities. In the main data set, for poor wines: Min. 2.740, 1st Qu. 3.300, Median 3.380, Mean 3.384, 3rd Qu. 3.500, Max. 3.900.
In the HQ wines subset, for “outliers”: Min. 3.320, 1st Qu. 3.370, Median 3.460, Mean 3.476, 3rd Qu. 3.560, Max. 3.720.
So, we can’t examine wine quality based only on the single parameter.
It was interesting to explore this data set. I’m not a wine expert. I don’t even drink. So it was fascinating to determine which characteristics make wines good or poor without tasting :)
When I used different plot types, I got more complete information about the same variable (e.g. a boxplot shows different details than a histogram or a density plot). I had never used boxplots before, and they are a very useful tool for describing a variable.
Exploring subset of HQ wines and outliers in the end of data analysis discovered some interesting dependencies. After that, I realized how parameters contribute to each other.
During analysis I faced such difficulties as:
Moreover, there are several tricky moment in the data set:
In this data set I didn’t find features such as age of wine, grape, soil acidity etc. Possibly, one of them could show interesting correlations.
Possible future researches:
EDA is really exciting and may take a huge amount of time. You should know when to stop.