# Echo the source code of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)
# Show the working directory that relative paths are resolved against.
getwd()
## [1] "C:/Users/Pallavi/Desktop/MSBA Aug22/Stat Methods/Wine_Project/Wine_Project"
# Load the red-wine quality data. The CSV sits one level above the working
# directory shown above, so it is addressed with a relative path instead of a
# machine-specific absolute path.
wine_csv_path <- file.path("..", "winequality-red.csv")
if (!file.exists(wine_csv_path)) {
  stop(
    "Cannot find '", wine_csv_path, "' relative to '", getwd(), "'.",
    call. = FALSE
  )
}
Winequality_Red <- read.csv(wine_csv_path)
# Convert to a tibble (compact printing, no partial `$` matching) and show it.
Winequality_Red <- tibble::as_tibble(Winequality_Red)
Winequality_Red
## # A tibble: 1,599 × 12
## fixed…¹ volat…² citri…³ resid…⁴ chlor…⁵ free.…⁶ total…⁷ density pH sulph…⁸
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076 11 34 0.998 3.51 0.56
## 2 7.8 0.88 0 2.6 0.098 25 67 0.997 3.2 0.68
## 3 7.8 0.76 0.04 2.3 0.092 15 54 0.997 3.26 0.65
## 4 11.2 0.28 0.56 1.9 0.075 17 60 0.998 3.16 0.58
## 5 7.4 0.7 0 1.9 0.076 11 34 0.998 3.51 0.56
## 6 7.4 0.66 0 1.8 0.075 13 40 0.998 3.51 0.56
## 7 7.9 0.6 0.06 1.6 0.069 15 59 0.996 3.3 0.46
## 8 7.3 0.65 0 1.2 0.065 15 21 0.995 3.39 0.47
## 9 7.8 0.58 0.02 2 0.073 9 18 0.997 3.36 0.57
## 10 7.5 0.5 0.36 6.1 0.071 17 102 0.998 3.35 0.8
## # … with 1,589 more rows, 2 more variables: alcohol <dbl>, quality <int>, and
## # abbreviated variable names ¹fixed.acidity, ²volatile.acidity, ³citric.acid,
## # ⁴residual.sugar, ⁵chlorides, ⁶free.sulfur.dioxide, ⁷total.sulfur.dioxide,
## # ⁸sulphates
# Put the columns of Winequality_Red on the search path so that the sections
# below can refer to them by bare name (e.g. fixed.acidity, pH).
# NOTE(review): attach() is generally discouraged; removing it would require
# qualifying every bare column reference below with Winequality_Red$.
attach(Winequality_Red)
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
# *** The sample size of the data is 1599 observations (rows) ***
# Summarise every variable concisely: minimum, quartiles, median, mean, maximum.
summary(Winequality_Red)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# ** Study of the data of each variable **
# (1) ** Fixed acidity
summary(fixed.acidity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.60 7.10 7.90 8.32 9.20 15.90
# Selected percentiles; `probs` is written out in full instead of relying on
# partial matching of the abbreviated argument name `p`.
quantile(fixed.acidity, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 6.5 7.1 7.9 9.2 10.7
# Half of the values are at or below 7.9 and 90% are at or below 10.7, so the
# maximum of 15.90 sits far out in the upper tail and is a likely outlier.
# Inspect the shape of the distribution with a density-scaled histogram.
# The x-axis runs to 16 so that the largest value (15.90) is not cut off.
hist(x = fixed.acidity, freq = FALSE, xlim = c(0, 16))
# The histogram is right-skewed; the kernel density curve confirms it.
lines(x = density(x = fixed.acidity), col = "red", lwd = 5)
# Use a boxplot to locate the outliers.
boxplot(fixed.acidity)
# The boxplot flags points beyond the upper whisker. With Q1 = 7.1 and
# Q3 = 9.2 the 1.5 * IQR fence is 9.2 + 1.5 * 2.1 = 12.35, so values above
# that are drawn as outliers.
# (2) ** Volatile acidity
summary(volatile.acidity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1200 0.3900 0.5200 0.5278 0.6400 1.5800
# Mean (0.528) and median (0.520) nearly coincide, so the distribution may be
# close to symmetric.
hist(volatile.acidity, probability = TRUE)
# The histogram on its own does not look strongly skewed, so overlay a kernel
# density curve for a closer look.
lines(density(volatile.acidity), lwd = 5, col = "red")
# The density curve suggests a slight right skew, so the presence of outliers
# needs to be investigated further.
# Check for outliers with a boxplot.
boxplot(x = volatile.acidity)
# The boxplot shows a few outliers at the high end, up to the maximum of 1.58.
# (3) ** Citric acid
summary(citric.acid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.090 0.260 0.271 0.420 1.000
# Mean (0.271) and median (0.260) are close, so the summary alone is not
# conclusive; look at the distribution.
hist(citric.acid, probability = TRUE)
lines(density(citric.acid), lwd = 5, col = "red")
# Skewness and outliers are not obvious from the histogram, so use a boxplot.
boxplot(x = citric.acid)
# The boxplot shows a single extreme value; it may be a genuine outlier or
# simply a data-entry error.
# (4) ** Residual sugar
summary(residual.sugar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.539 2.600 15.500
# The maximum (15.5) is 12.9 above the third quartile (2.6), which points to a
# strong presence of high-end outliers.
# Measure the spread with a mean +/- 2 SD band (a rough ~95% range under
# normality; this is a range for the data, not a confidence interval).
sd(residual.sugar) # 1.41
## [1] 1.409928
var(residual.sugar) # 1.99
## [1] 1.987897
mean(residual.sugar) - 2 * sd(residual.sugar) # lower bound, -0.281
## [1] -0.2810506
mean(residual.sugar) + 2 * sd(residual.sugar) # upper bound, +5.359
## [1] 5.358662
# Share of observations strictly inside the band. The bounds and the sample
# size are taken from the data rather than typed in by hand.
residual_sugar_band <- mean(residual.sugar) + c(-2, 2) * sd(residual.sugar)
mean(
  residual.sugar > residual_sugar_band[1] &
    residual.sugar < residual_sugar_band[2]
)
## [1] 0.9530957
# 95.3% of the data lies inside the band and the remaining 4.7% lies outside.
# This share is close to what a normal distribution would give, so it does not
# reveal the skew by itself; check the shape with a density-scaled histogram
# and a kernel density curve.
hist(residual.sugar, freq = FALSE)
lines(x = density(x = residual.sugar), col = "red", lwd = 5)
# The density curve shows that the distribution is right-skewed.
# Locate the outliers with a boxplot and list the value counts.
boxplot(residual.sugar)
table(residual.sugar)
## residual.sugar
## 0.9 1.2 1.3 1.4 1.5 1.6 1.65 1.7 1.75 1.8 1.9 2 2.05 2.1 2.15 2.2
## 2 8 5 35 30 58 2 76 2 129 117 156 2 128 2 131
## 2.25 2.3 2.35 2.4 2.5 2.55 2.6 2.65 2.7 2.8 2.85 2.9 2.95 3 3.1 3.2
## 1 109 1 86 84 1 79 1 39 49 1 24 1 25 7 15
## 3.3 3.4 3.45 3.5 3.6 3.65 3.7 3.75 3.8 3.9 4 4.1 4.2 4.25 4.3 4.4
## 11 15 1 2 8 1 4 1 8 6 11 6 5 1 8 4
## 4.5 4.6 4.65 4.7 4.8 5 5.1 5.15 5.2 5.4 5.5 5.6 5.7 5.8 5.9 6
## 4 6 2 1 3 1 5 1 3 1 8 6 1 4 3 4
## 6.1 6.2 6.3 6.4 6.55 6.6 6.7 7 7.2 7.3 7.5 7.8 7.9 8.1 8.3 8.6
## 4 3 2 3 2 2 2 1 1 1 1 2 3 2 3 1
## 8.8 8.9 9 10.7 11 12.9 13.4 13.8 13.9 15.4 15.5
## 2 1 1 1 2 1 1 2 1 2 1
# By the boxplot's 1.5 * IQR rule the upper fence is 2.6 + 1.5 * (2.6 - 1.9) = 3.65, and the table above shows about 155 values beyond it (far more than a handful). This is a significant number of outliers and they need to be treated.
# (5) ** Chlorides
summary(chlorides)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01200 0.07000 0.07900 0.08747 0.09000 0.61100
# The maximum (0.611) is 0.52 above the third quartile (0.090), so outliers
# are possible.
hist(chlorides, probability = TRUE)
# The histogram is right-skewed and shows values far out in the tail; overlay
# the density curve and confirm the outliers with a boxplot.
lines(density(chlorides), lwd = 5, col = "red")
boxplot(x = chlorides)
# The boxplot shows outliers; they should be examined thoroughly before this
# variable is used.
# (6) ** Free sulfur dioxide
summary(free.sulfur.dioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 14.00 15.87 21.00 72.00
# Mean and median are fairly close, but the maximum (72) is more than three
# times the third quartile (21), which points to high-end outliers.
# Measure the spread with a mean +/- 2 SD band (a range for the data, not a
# confidence interval).
sd(free.sulfur.dioxide) # 10.46
## [1] 10.46016
var(free.sulfur.dioxide) # 109.41
## [1] 109.4149
mean(free.sulfur.dioxide) - 2 * sd(free.sulfur.dioxide) # lower bound, -5.05
## [1] -5.045392
mean(free.sulfur.dioxide) + 2 * sd(free.sulfur.dioxide) # upper bound, +36.80
## [1] 36.79524
# Share of observations strictly inside the band. The bounds and the sample
# size are taken from the data rather than typed in by hand.
free_so2_band <- mean(free.sulfur.dioxide) + c(-2, 2) * sd(free.sulfur.dioxide)
mean(
  free.sulfur.dioxide > free_so2_band[1] &
    free.sulfur.dioxide < free_so2_band[2]
)
## [1] 0.9587242
# 95.9% of the data lies inside the band and the remaining 4.1% lies outside;
# all of the excluded points are on the high side because the lower bound is
# below the minimum of 1.
# Look at the distribution with a histogram.
hist(free.sulfur.dioxide)
# The data is right-skewed and not normally distributed.
# Values are mostly whole numbers, so the histogram is left on the frequency
# (count) scale.
# Locate the outliers with a boxplot.
boxplot(free.sulfur.dioxide)
# The boxplot shows outliers; they might stem from a data-capture issue.
table(free.sulfur.dioxide)
## free.sulfur.dioxide
## 1 2 3 4 5 5.5 6 7 8 9 10 11 12 13 14 15
## 3 1 49 41 104 1 138 71 56 62 79 59 75 57 50 78
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 61 60 46 39 30 41 22 32 34 24 32 29 23 23 16 20
## 32 33 34 35 36 37 37.5 38 39 40 40.5 41 42 43 45 46
## 22 11 18 15 11 3 2 9 5 6 1 7 3 3 3 1
## 47 48 50 51 52 53 54 55 57 66 68 72
## 1 4 2 4 3 1 1 2 1 1 2 1
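# By the boxplot's 1.5 * IQR rule the upper fence is 21 + 1.5 * (21 - 7) = 42, and the table shows 30 values above it; the four values above 60 (66, 68, 68 and 72) are the most extreme outliers.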
# (7) ** Total sulfur dioxide
summary(total.sulfur.dioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 22.00 38.00 46.47 62.00 289.00
# The mean exceeds the median by 8.47, which suggests the data is not
# symmetric. The maximum (289) is also 4.66 times the third quartile (62), so
# outliers are likely.
# Investigate further with a histogram.
hist(x = total.sulfur.dioxide)
# The variable is treated as discrete here, so the histogram is left on the
# frequency (count) scale.
# The histogram shows a right-skewed distribution.
# Now check for outliers.
boxplot(x = total.sulfur.dioxide)
# The boxplot confirms that outliers are present; they need to be investigated
# or treated.
# (8) ** Density **
summary(density)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.9901 0.9956 0.9968 0.9967 0.9978 1.0037
# Mean (0.9967) and median (0.9968) are practically equal and the maximum is
# less than 0.01 above the third quartile, which is consistent with a roughly
# symmetric, bell-shaped distribution.
# Check the distribution with a density-scaled histogram.
hist(density, probability = TRUE)
# In the call below the outer `density` is the kernel density function and the
# inner `density` is the data column.
lines(density(density), lwd = 5, col = "red")
# The histogram and the density curve both look approximately normal.
# (9) ** pH
summary(pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.740 3.210 3.310 3.311 3.400 4.010
# Mean and median are practically equal, so the data looks well behaved at
# first sight. The maximum is about 0.6 above the third quartile, so the upper
# tail still needs a closer look.
# Check the distribution with a density-scaled histogram.
hist(pH, freq = FALSE)
# The histogram looks approximately normal, with a few possible outliers at
# the far right of the plot that are hard to judge visually.
lines(x = density(x = pH), col = "red", lwd = 5)
# Check with a boxplot.
boxplot(pH)
# One point looks somewhat distant, but the boxplot alone does not settle
# whether there are outliers.
# Apply the 1.5 * IQR rule. The quartiles are computed from the data instead of
# being copied by hand, and the result is not stored under the name `IQR`, so
# the stats::IQR function is not shadowed.
pH_iqr <- stats::IQR(pH)
pH_iqr
## [1] 0.19
Outlier_upper <- quantile(pH, probs = 0.75, names = FALSE) + 1.5 * pH_iqr
Outlier_upper # 3.685
## [1] 3.685
Outlier_lower <- quantile(pH, probs = 0.25, names = FALSE) - 1.5 * pH_iqr
Outlier_lower # 2.925
## [1] 2.925
# Outliers are therefore values below 2.925 or above 3.685.
# Cross-check against the table of value counts.
table(pH)
## pH
## 2.74 2.86 2.87 2.88 2.89 2.9 2.92 2.93 2.94 2.95 2.98 2.99 3 3.01 3.02 3.03
## 1 1 1 2 4 1 4 3 4 1 5 2 6 5 8 6
## 3.04 3.05 3.06 3.07 3.08 3.09 3.1 3.11 3.12 3.13 3.14 3.15 3.16 3.17 3.18 3.19
## 10 8 10 11 11 11 19 9 20 13 21 34 36 27 30 25
## 3.2 3.21 3.22 3.23 3.24 3.25 3.26 3.27 3.28 3.29 3.3 3.31 3.32 3.33 3.34 3.35
## 39 36 39 32 29 26 53 35 42 46 57 39 45 37 43 39
## 3.36 3.37 3.38 3.39 3.4 3.41 3.42 3.43 3.44 3.45 3.46 3.47 3.48 3.49 3.5 3.51
## 56 37 48 48 37 34 33 17 29 20 22 21 19 10 14 15
## 3.52 3.53 3.54 3.55 3.56 3.57 3.58 3.59 3.6 3.61 3.62 3.63 3.66 3.67 3.68 3.69
## 18 17 16 8 11 10 10 8 7 8 4 3 4 3 5 4
## 3.7 3.71 3.72 3.74 3.75 3.78 3.85 3.9 4.01
## 1 4 3 1 1 2 1 2 2
# From the table, 21 values lie above the upper fence (3.685) and 14 values lie below the lower fence (2.925).
# (10) ** Sulphates
summary(sulphates)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3300 0.5500 0.6200 0.6581 0.7300 2.0000
# Mean and median are fairly close, so the summary alone does not settle
# whether outliers are present.
# Check further with a density-scaled histogram.
hist(sulphates, probability = TRUE)
lines(density(sulphates), lwd = 5, col = "red")
# The histogram is not normally distributed and is slightly right-skewed.
# The kernel density curve also shows the right skew.
# Check for outliers with a boxplot.
boxplot(x = sulphates)
# The boxplot shows outliers, which need to be accounted for before the data
# is used further.
# (11) ** Alcohol **
summary(alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.40 9.50 10.20 10.42 11.10 14.90
# Mean and median are fairly close, so the summary alone does not settle
# whether outliers are present.
# Check further with a density-scaled histogram.
hist(alcohol, probability = TRUE)
lines(density(alcohol), lwd = 5, col = "red")
# The histogram is right-skewed, but the bars are contiguous with no isolated
# values, so the histogram gives no sign of outliers.
# From the histogram alone there appears to be no outlier or data-quality issue.
# Investigate further with a boxplot.
boxplot(x = alcohol)
# The boxplot does show outliers, which need to be accounted for before the
# data is used further.
# (12) ** Quality
summary(quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.636 6.000 8.000
# Quality is a discrete (integer) score. Mean and median differ by about 0.4,
# and the maximum is 2.0 above the third quartile, so outliers are possible.
# Check further with a histogram.
hist(x = quality)
# The bars are separated by gaps because the score only takes whole-number
# values, so the histogram does not help to judge outliers.
# Use a boxplot to check for outliers instead.
boxplot(x = quality)
# The boxplot shows very few outliers.
Q1 : Total sample size : 1599
Q2 : Outliers : My interpretation is listed in the table below.
Concerns about data quality are also described in the output file for each variable.
Q3 : The summary function has been used to describe each variable concisely.
Statistics used to explain the variables - mean, median, IQR, percentiles, CI (computed here as mean ± 2 SD), SD, variance
Q4 : A histogram with a probability density curve is used for continuous variables, and a histogram of frequencies is used for discrete variables. A boxplot is used to check for the presence of outliers.
Q5 : Skewness with respect to individual variable is described in output file.
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.