Source of the data: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
Attribute Information:
For more information, read [Cortez et al., 2009].
Input variables (based on physicochemical tests):
Output variable (based on sensory data):
library(ggplot2)
library(scales)
library(DT)
library(tidyr)
library(dplyr)
library(psych)
library(gridExtra)
library(GGally)
library(corrplot)
library(ggExtra)
library(ggcorrplot)
# Load the red-wine data set; the file uses ';' as its field separator.
df <- read.csv(file = "winequality-red.csv", sep = ";")
# Interactive preview of the first five rows.
datatable(head(df, n = 5))
# Static alternative: knitr::kable(head(df, 5))
# Structure of the data: dimensions and the type of each column.
str(df)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Check for missing values:
# count the rows that are NOT complete cases (at least one NA in the row).
sum(!complete.cases(df))
## [1] 0
# Descriptive statistics for every column of the data frame (psych package).
psych::describe(x = df)
## vars n mean sd median trimmed mad min
## fixed.acidity 1 1599 8.32 1.74 7.90 8.15 1.48 4.60
## volatile.acidity 2 1599 0.53 0.18 0.52 0.52 0.18 0.12
## citric.acid 3 1599 0.27 0.19 0.26 0.26 0.25 0.00
## residual.sugar 4 1599 2.54 1.41 2.20 2.26 0.44 0.90
## chlorides 5 1599 0.09 0.05 0.08 0.08 0.01 0.01
## free.sulfur.dioxide 6 1599 15.87 10.46 14.00 14.58 10.38 1.00
## total.sulfur.dioxide 7 1599 46.47 32.90 38.00 41.84 26.69 6.00
## density 8 1599 1.00 0.00 1.00 1.00 0.00 0.99
## pH 9 1599 3.31 0.15 3.31 3.31 0.15 2.74
## sulphates 10 1599 0.66 0.17 0.62 0.64 0.12 0.33
## alcohol 11 1599 10.42 1.07 10.20 10.31 1.04 8.40
## quality 12 1599 5.64 0.81 6.00 5.59 1.48 3.00
## max range skew kurtosis se
## fixed.acidity 15.90 11.30 0.98 1.12 0.04
## volatile.acidity 1.58 1.46 0.67 1.21 0.00
## citric.acid 1.00 1.00 0.32 -0.79 0.00
## residual.sugar 15.50 14.60 4.53 28.49 0.04
## chlorides 0.61 0.60 5.67 41.53 0.00
## free.sulfur.dioxide 72.00 71.00 1.25 2.01 0.26
## total.sulfur.dioxide 289.00 283.00 1.51 3.79 0.82
## density 1.00 0.01 0.07 0.92 0.00
## pH 4.01 1.27 0.19 0.80 0.00
## sulphates 2.00 1.67 2.42 11.66 0.00
## alcohol 14.90 6.50 0.86 0.19 0.03
## quality 8.00 5.00 0.22 0.29 0.02
The red wine data has 1599 observations and 12 variables.
There are no missing values.
All variables except quality are numeric; quality - the target variable - is an integer.
Based on the skew values, residual sugar, chlorides, and sulphates are strongly right-skewed.
# Frequency table of the target variable: number of wines and percentage of
# all wines for each observed quality score.
quality_counts <- table(df$quality)
quality_pct <- quality_counts / nrow(df) * 100

quality_summary <- as.data.frame(rbind(quality_counts, quality_pct))
rownames(quality_summary) <- c("Number of Wines", "%")
# Label the columns with the quality scores actually present in the data
# instead of a hard-coded 3..8, so the labels can never drift from the counts.
colnames(quality_summary) <- names(quality_counts)
quality_summary <- round(quality_summary, digits = 2)
# Static alternative: knitr::kable(quality_summary, digits = 2)
datatable(quality_summary)
Define variables for visualization
# Colours shared by the plots in this script.
fill.color <- "#154360" # alternative: "#1579D2"
fill.color2 <- "#D35400"
vline.color <- "#D0DBE5"
axis.line.color <- "#e5e7e9"
lab.color <- "#D68910"

# Shared ggplot2 theme: theme_bw() with the panel border and grid removed,
# axis ticks hidden, light axis lines, Courier text, and centred title and
# subtitle.
all.theme <- theme_bw() +
  theme(
    # Typography (a base size of 16 was tried and left disabled).
    text = element_text(family = "Courier"),
    axis.text.x = element_text(family = "Courier", vjust = 0.6, hjust = 0.2),
    axis.text.y = element_text(family = "Courier", vjust = 0.6, hjust = 0.2),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    # Panel decoration.
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Axes.
    axis.line = element_line(color = axis.line.color),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank()
  )
Density vs Histogram
reference:
Density plot is also known as Kernel Density Plots, Density Trace Graph.
A Density Plot visualises the distribution of data over a continuous interval or time period. This chart is a variation of a Histogram that uses kernel smoothing to plot values, allowing for smoother distributions by smoothing out the noise. The peaks of a Density Plot help display where values are concentrated over the interval.
An advantage Density Plots have over Histograms is that they’re better at determining the distribution shape because they’re not affected by the number of bins used (each bar used in a typical histogram). A Histogram comprising only 4 bins wouldn’t produce a distinguishable enough shape of distribution as a 20-bin Histogram would. However, with Density Plots, this isn’t an issue.
# Rounded mean of fixed acidity; positions the dashed reference line and its
# label in the first panel.
avg.cnt <- round(mean(df$fixed.acidity), 0)

# Build a filled kernel-density plot for one column of `df` using the shared
# theme. `var` is the column name as a character string; the x-axis label is
# set to that name explicitly.
density_panel <- function(var) {
  ggplot(df, aes(x = .data[[var]])) +
    geom_density(fill = fill.color, color = fill.color) +
    labs(x = var) +
    all.theme
}

# First panel: fixed acidity, fully annotated.
g1 <- density_panel("fixed.acidity") +
  labs(
    title = "Density plot",
    subtitle = "The fixed acidity is slightly right skewed",
    caption = "Data source: UCI (download year 2018)",
    x = "Fixed acidity",
    y = "Density"
  ) +
  scale_x_continuous(labels = comma, breaks = seq(0, 21, 5)) +
  geom_vline(
    xintercept = avg.cnt, size = 1, color = vline.color, linetype = "dashed"
  ) +
  # annotate() draws the label exactly once. geom_text(aes(...)) with constant
  # aesthetics would draw one copy per row of `df`, all on top of each other.
  annotate(
    "text",
    x = avg.cnt + .2, y = .3, label = comma(avg.cnt),
    size = 4, color = lab.color
  )

# Remaining panels: one plain density plot per other input variable, in column
# order (the target `quality` is excluded).
other_vars <- setdiff(names(df), c("fixed.acidity", "quality"))
density_plots <- c(list(g1), lapply(other_vars, density_panel))

grid.arrange(
  grobs = density_plots,
  ncol = 3,
  nrow = ceiling(length(density_plots) / 3)
)
# Rounded mean quality score (not referenced by the bar chart below).
avg.cnt <- round(mean(df$quality), 0)

# Bar chart: number of wines at each quality score.
ggplot(data = df, mapping = aes(quality)) +
  geom_bar(color = fill.color, fill = fill.color) +
  all.theme +
  labs(
    x = "quality",
    y = "count",
    title = "bar chart of wine quality",
    subtitle = "The quality of wine follows normal distribution with the majority of wine with medium quality",
    caption = "Data source: UCI (download year 2018)"
  )
Scatterplot is also known as a Scatter Graph, Point Graph, X-Y Plot, Scatter Chart or Scattergram.
Scatterplots use a collection of points placed using Cartesian Coordinates to display values from two variables. By displaying a variable in each axis, you can detect if a relationship or correlation between the two variables exists.
Various types of correlation can be interpreted through the patterns displayed on Scatterplots. These are: - positive (values increase together), - negative (one value decreases as the other increases), - null (no correlation), - linear, exponential and U-shaped. The strength of the correlation can be determined by how closely packed the points are to each other on the graph. Points that end up far outside the general cluster of points are known as outliers.
Lines or curves are fitted within the graph to aid in analysis. They are drawn as close to all the points as possible, to show what the points would look like if they were condensed into a single line. This is typically known as the Line of Best Fit or a Trend Line and can be used to make estimates via interpolation.
Scatterplots are ideal when you have paired numerical data and you want to see if one variable impacts the other. However, do remember that correlation is not causation and another unnoticed variable may be influencing results.
# Scatterplot of volatile acidity against fixed acidity, all points in one colour.
ggplot(data = df, mapping = aes(x = fixed.acidity, y = volatile.acidity)) +
  geom_point(color = fill.color, fill = fill.color) +
  # Alternative: colour the points by quality score.
  # geom_point(aes(color = as.factor(quality))) +
  scale_x_continuous(breaks = seq(0, 20, 5)) +
  scale_y_continuous(breaks = seq(0, 2, .5), labels = comma) +
  all.theme +
  labs(
    x = "fixed acidity",
    y = "volatile acidity",
    title = "Scatterplot",
    subtitle = "fixed acidity vs. volatile acidity",
    caption = "Data source: UCI (download year 2018)"
  )
# Scatterplot of volatile acidity against fixed acidity with a linear trend
# line (no confidence band), kept in `g` so marginal plots can be attached.
g <- ggplot(df, aes(x = fixed.acidity, y = volatile.acidity)) +
  geom_point(fill = fill.color, color = fill.color) +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE, color = fill.color2) +
  labs(
    title = "Scatterplot",
    subtitle = "fixed acidity vs. volatile acidity",
    caption = "Data source: UCI (download year 2018)",
    x = "fixed acidity",
    y = "volatile acidity"
  ) +
  all.theme +
  scale_y_continuous(labels = comma, breaks = seq(0, 2, .5)) +
  scale_x_continuous(breaks = seq(0, 20, 5))

# Same scatterplot with the marginal distributions shown two ways.
ggMarginal(g, type = "histogram", fill = "transparent")
ggMarginal(g, type = "boxplot", fill = "transparent")
# Build a violin plot of one column of `df` per quality score, with a narrow
# box plot overlaid. `var` is the column name as a character string; the
# y-axis label is set to that name explicitly.
violin_panel <- function(var) {
  ggplot(df, aes(x = as.factor(quality), y = .data[[var]])) +
    geom_violin(aes(fill = quality)) +
    geom_boxplot(width = 0.1) +
    labs(y = var) +
    all.theme
}

# One panel per input variable, in column order (the target `quality` is the
# grouping variable and is therefore excluded).
violin_vars <- setdiff(names(df), "quality")
violin_plots <- lapply(violin_vars, violin_panel)

grid.arrange(
  grobs = violin_plots,
  ncol = 3,
  nrow = ceiling(length(violin_plots) / 3)
)
# Pairwise plot matrix over all 12 columns, styled with the shared theme.
ggpairs(
  data = df,
  columns = 1:12,
  title = "Red Wine Quality Data",
  axisLabels = "show"
) +
  all.theme
# Correlation matrix of all columns, rounded to one decimal place.
corr <- round(cor(df), digits = 1)

# Lower-triangle correlogram drawn with circles and value labels; variables
# are reordered by hierarchical clustering (hc.order).
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlation of redwine variables"
)