## Installing packages: install.packages(c("package A", "package X"))
## Use 'install.packages(c("package A", "package X"))' to install any of these packages that are not installed yet
library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.0 v purrr  0.3.3
v tibble  2.1.3 v dplyr  0.8.5
v tidyr  1.0.2 v stringr 1.4.0
v readr  1.3.1 v forcats 0.5.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(ggplot2)
library(corrplot)
corrplot 0.84 loaded
## Load the red and white wine-quality tables.
## The data directory is named once so that it only has to be changed in one place.
wine_data_dir <- 'C:/Users/HP/Documents/EDA/R/Wine'
## Note the 'sep' argument: the files are delimited by ';' rather than by ','
red_wine <- read.csv(
  file.path(wine_data_dir, 'winequality-red.csv'),
  sep = ";"
)
white_wine <- read.csv(
  file.path(wine_data_dir, 'winequality-white.csv'),
  sep = ";"
)
## First look at the red wines: leading rows, column types, per-column summary
head(red_wine)
glimpse(red_wine)
summary(red_wine)
The same inspection can be repeated for white_wine. Before we go into data visualisation, we can merge the two datasets into one, which saves us from duplicating every task.
## Tag every row with the table it came from, then stack the two tables.
red_wine$style <- "red"
white_wine$style <- "white"
## The goal is to append rows, not to match them: once 'style' is set, no red
## row can equal a white row, so a join on all columns never finds a match.
## bind_rows() states that intent directly and keeps red rows first, white after.
wine <- bind_rows(red_wine, white_wine)
Joining, by = c("fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar", "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density", "pH", "sulphates", "alcohol", "quality", "style")
## Missing values: total number of NA cells in the combined table
wine %>%
  is.na() %>%
  sum()
[1] 0
## Column-wise overview (name, type, leading values) of the combined table
wine %>%
  glimpse()
Observations: 6,497
Variables: 13
$ fixed.acidity <dbl> 7.4, 7.8, 7.8, 11.2, 7.4, 7.4, 7.9, 7.3, 7.8, 7.5, 6.7, 7.5, 5.6, 7....
$ volatile.acidity <dbl> 0.700, 0.880, 0.760, 0.280, 0.700, 0.660, 0.600, 0.650, 0.580, 0.500...
$ citric.acid <dbl> 0.00, 0.00, 0.04, 0.56, 0.00, 0.00, 0.06, 0.00, 0.02, 0.36, 0.08, 0....
$ residual.sugar <dbl> 1.9, 2.6, 2.3, 1.9, 1.9, 1.8, 1.6, 1.2, 2.0, 6.1, 1.8, 6.1, 1.6, 1.6...
$ chlorides <dbl> 0.076, 0.098, 0.092, 0.075, 0.076, 0.075, 0.069, 0.065, 0.073, 0.071...
$ free.sulfur.dioxide <dbl> 11, 25, 15, 17, 11, 13, 15, 15, 9, 17, 15, 17, 16, 9, 52, 51, 35, 16...
$ total.sulfur.dioxide <dbl> 34, 67, 54, 60, 34, 40, 59, 21, 18, 102, 65, 102, 59, 29, 145, 148, ...
$ density <dbl> 0.9978, 0.9968, 0.9970, 0.9980, 0.9978, 0.9978, 0.9964, 0.9946, 0.99...
$ pH <dbl> 3.51, 3.20, 3.26, 3.16, 3.51, 3.51, 3.30, 3.39, 3.36, 3.35, 3.28, 3....
$ sulphates <dbl> 0.56, 0.68, 0.65, 0.58, 0.56, 0.56, 0.46, 0.47, 0.57, 0.80, 0.54, 0....
$ alcohol <dbl> 9.4, 9.8, 9.8, 9.8, 9.4, 9.4, 9.4, 10.0, 9.5, 10.5, 9.2, 10.5, 9.9, ...
$ quality <int> 5, 5, 5, 6, 5, 5, 5, 7, 7, 5, 5, 5, 5, 5, 5, 5, 7, 5, 4, 6, 6, 5, 5,...
$ style <chr> "red", "red", "red", "red", "red", "red", "red", "red", "red", "red"...
We see that wine$quality is an integer value which, according to the documentation, rates the quality of the wine on a scale of 1-10. Let's transform it into three categories: 'low', 'medium' and 'high'.
## Bucket the integer quality score into three ordered categories.
## cut() builds right-closed intervals (lower, upper], so by default the lowest
## break itself is excluded and a score of 1 would become NA.
## include.lowest = TRUE closes the first interval so the whole 1-10 scale is covered.
wine$qualityBucket <- cut(
  wine$quality,                          ## Cut this variable
  breaks = c(1, 4, 6, 10),               ## at these boundaries: [1,4], (4,6], (6,10]
  labels = c("low", "Medium", "High"),   ## into these respective buckets
  include.lowest = TRUE
)
## Number of wines in each quality bucket, one panel per wine style
wine %>%
  ggplot(aes(x = qualityBucket)) +
  geom_bar() +
  facet_wrap(~ style, ncol = 1)
Let's see the alcohol distribution for each wine style.
## Histogram of alcohol content in 0.1-wide bins, one panel per wine style
ggplot(data = wine, mapping = aes(x = alcohol)) +
  geom_histogram(binwidth = 0.1) +
  facet_wrap(~ style, ncol = 1)
## Bivariate relationships: we are primarily assessing the relationship with wine quality, but you can adapt this to any other feature
## Summary statistics of alcohol for red wines, split by quality bucket.
## The bare column names are kept inside by() because by() uses the grouping
## expression as the header of each printed group.
wine %>%
  filter(style == 'red') %>%
  with(by(alcohol, qualityBucket, summary))
qualityBucket: low
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.40 9.60 10.00 10.22 11.00 13.10
----------------------------------------------------------------------------
qualityBucket: Medium
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.40 9.50 10.00 10.25 10.90 14.90
----------------------------------------------------------------------------
qualityBucket: High
Min. 1st Qu. Median Mean 3rd Qu. Max.
9.20 10.80 11.60 11.52 12.20 14.00
## Alcohol vs. quality with a fitted linear trend, one panel per wine style.
## The original built this exact plot twice in a row; it is built once here
## because the facets already show both red and white.
## Points are drawn semi-transparent, and the smoothing formula is spelled
## out rather than left to the default.
wine %>%
  ggplot(aes(x = alcohol, y = quality)) +
  geom_point(alpha = 1 / 4) +
  stat_smooth(method = "lm", formula = y ~ x) +
  facet_wrap(~style, ncol = 1)
## Pearson correlation matrix for the red wines, computed on every column
## except the two non-numeric ones (qualityBucket and style).
cor_Red <- wine %>%
  filter(style == "red") %>%
  select(-qualityBucket, -style) %>%
  cor(method = "pearson")
## Plot the absolute correlations, then print the signed matrix.
corrplot.mixed(abs(cor_Red))
print(cor_Red)
## Pearson correlation matrix for the white wines, computed on every column
## except the two non-numeric ones (qualityBucket and style).
cor_White <- wine %>%
  filter(style == "white") %>%
  select(-qualityBucket, -style) %>%
  cor(method = "pearson")
## Plot the absolute correlations, then print the signed matrix.
corrplot.mixed(abs(cor_White))
print(cor_White)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
fixed.acidity 1.00000000 -0.02269729 0.289180698 0.08902070 0.02308564
volatile.acidity -0.02269729 1.00000000 -0.149471811 0.06428606 0.07051157
citric.acid 0.28918070 -0.14947181 1.000000000 0.09421162 0.11436445
residual.sugar 0.08902070 0.06428606 0.094211624 1.00000000 0.08868454
chlorides 0.02308564 0.07051157 0.114364448 0.08868454 1.00000000
free.sulfur.dioxide -0.04939586 -0.09701194 0.094077221 0.29909835 0.10139235
total.sulfur.dioxide 0.09106976 0.08926050 0.121130798 0.40143931 0.19891030
density 0.26533101 0.02711385 0.149502571 0.83896645 0.25721132
pH -0.42585829 -0.03191537 -0.163748211 -0.19413345 -0.09043946
sulphates -0.01714299 -0.03572815 0.062330940 -0.02666437 0.01676288
alcohol -0.12088112 0.06771794 -0.075728730 -0.45063122 -0.36018871
quality -0.11366283 -0.19472297 -0.009209091 -0.09757683 -0.20993441
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
fixed.acidity -0.0493958591 0.091069756 0.26533101 -0.4258582910 -0.01714299
volatile.acidity -0.0970119393 0.089260504 0.02711385 -0.0319153683 -0.03572815
citric.acid 0.0940772210 0.121130798 0.14950257 -0.1637482114 0.06233094
residual.sugar 0.2990983537 0.401439311 0.83896645 -0.1941334540 -0.02666437
chlorides 0.1013923521 0.198910300 0.25721132 -0.0904394560 0.01676288
free.sulfur.dioxide 1.0000000000 0.615500965 0.29421041 -0.0006177961 0.05921725
total.sulfur.dioxide 0.6155009650 1.000000000 0.52988132 0.0023209718 0.13456237
density 0.2942104109 0.529881324 1.00000000 -0.0935914935 0.07449315
pH -0.0006177961 0.002320972 -0.09359149 1.0000000000 0.15595150
sulphates 0.0592172458 0.134562367 0.07449315 0.1559514973 1.00000000
alcohol -0.2501039415 -0.448892102 -0.78013762 0.1214320987 -0.01743277
quality 0.0081580671 -0.174737218 -0.30712331 0.0994272457 0.05367788
alcohol quality
fixed.acidity -0.12088112 -0.113662831
volatile.acidity 0.06771794 -0.194722969
citric.acid -0.07572873 -0.009209091
residual.sugar -0.45063122 -0.097576829
chlorides -0.36018871 -0.209934411
free.sulfur.dioxide -0.25010394 0.008158067
total.sulfur.dioxide -0.44889210 -0.174737218
density -0.78013762 -0.307123313
pH 0.12143210 0.099427246
sulphates -0.01743277 0.053677877
alcohol 1.00000000 0.435574715
quality 0.43557472 1.000000000