Wine_Project

knitr::opts_chunk$set(echo = TRUE)
getwd()

## [1] "C:/Users/Pallavi/Desktop/MSBA Aug22/Stat Methods/Wine_Project/Wine_Project"

# Load the red-wine quality data. The CSV sits one directory above the working
# directory printed by getwd(), so a relative path is used instead of a
# user-specific absolute path; this keeps the script runnable on other machines.
wine_csv <- file.path("..", "winequality-red.csv")
# Fail early with a clear message if the script is run from another directory.
if (!file.exists(wine_csv)) {
  stop("Cannot find ", wine_csv, " relative to ", getwd(), call. = FALSE)
}
Winequality_Red <- tibble::as_tibble(read.csv(wine_csv))
Winequality_Red

## # A tibble: 1,599 × 12
##    fixed…¹ volat…² citri…³ resid…⁴ chlor…⁵ free.…⁶ total…⁷ density    pH sulph…⁸
##      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl>   <dbl>
##  1     7.4    0.7     0        1.9   0.076      11      34   0.998  3.51    0.56
##  2     7.8    0.88    0        2.6   0.098      25      67   0.997  3.2     0.68
##  3     7.8    0.76    0.04     2.3   0.092      15      54   0.997  3.26    0.65
##  4    11.2    0.28    0.56     1.9   0.075      17      60   0.998  3.16    0.58
##  5     7.4    0.7     0        1.9   0.076      11      34   0.998  3.51    0.56
##  6     7.4    0.66    0        1.8   0.075      13      40   0.998  3.51    0.56
##  7     7.9    0.6     0.06     1.6   0.069      15      59   0.996  3.3     0.46
##  8     7.3    0.65    0        1.2   0.065      15      21   0.995  3.39    0.47
##  9     7.8    0.58    0.02     2     0.073       9      18   0.997  3.36    0.57
## 10     7.5    0.5     0.36     6.1   0.071      17     102   0.998  3.35    0.8 
## # … with 1,589 more rows, 2 more variables: alcohol <dbl>, quality <int>, and
## #   abbreviated variable names ¹fixed.acidity, ²volatile.acidity, ³citric.acid,
## #   ⁴residual.sugar, ⁵chlorides, ⁶free.sulfur.dioxide, ⁷total.sulfur.dioxide,
## #   ⁸sulphates

# attach() puts the columns of Winequality_Red on the search path, which is why
# the code below can refer to them by bare name (e.g. fixed.acidity).
# NOTE(review): attach() is generally discouraged because attached columns can
# clash with other objects (the `density` column shares its name with
# stats::density()); Winequality_Red$col or with() would be safer.
attach(Winequality_Red)

library(ggplot2)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(dplyr)
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

#***The sample size of data is 1599***

# Now we will summarize the data of each variable in a concise way.

summary(Winequality_Red)

##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000

#**Study of data of each variable**
#(1)**Fixed acidity
summary(fixed.acidity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.60    7.10    7.90    8.32    9.20   15.90

# 10th/25th/50th/75th/90th percentiles of fixed acidity. The argument is
# spelled out as `probs` rather than relying on partial matching of `p`.
quantile(Winequality_Red$fixed.acidity, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))

##  10%  25%  50%  75%  90% 
##  6.5  7.1  7.9  9.2 10.7

# From the summary and quantiles, 50% of the data lies at or below 7.9 and 90% at or below 10.7, which suggests that the maximum value of 15.90 is an outlier.

#So now we will find outlier by histogram. 


# Density-scaled histogram of fixed acidity. The x-axis is extended to cover
# the maximum (15.9); a fixed upper limit of 15 would clip the right tail,
# which is exactly the region being inspected for outliers.
hist(x = fixed.acidity, freq = FALSE,
     xlim = c(0, ceiling(max(fixed.acidity, na.rm = TRUE))))


# From the histogram it is clear that the data is right-skewed.

# And the kernel density curve confirms it.
lines(x = density(x = fixed.acidity), col = "red", lwd = 5)

# To again find out the nos.of outlier we will draw boxplot.

boxplot(fixed.acidity)

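# From the boxplot, points above the upper whisker are outliers (the 1.5*IQR fence is 9.2 + 1.5*2.1 = 12.35), and the most extreme ones can be seen beyond about 14.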

#(2)**Volatile acidity
summary(volatile.acidity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1200  0.3900  0.5200  0.5278  0.6400  1.5800

# from Summary it is clear that there is not much of the difference bw mean and median so it might be normal distribution.

hist(volatile.acidity,freq=FALSE)
# from histograme its clear that data does not look very much skewed.so we need to check further. 

lines(x = density(x = volatile.acidity), col = "red",lwd=5)

# So post looking at probability distribution curve variable seems to be slightly skewed towards right side.so need to further investigate to understand the presence of outlier.

#lets check the outlier from boxplot.
boxplot(volatile.acidity)

# from boxplot is clear that there might not be many but there seem to be few outliers with the value of 1.58 acidity. 

#(3)**Citric acid
summary(citric.acid)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.090   0.260   0.271   0.420   1.000

# From the summary data it seems mean and median are quite close so we need to investigate further.

# Density-scaled histogram of citric acid with its kernel density estimate
# drawn on top as a thick red curve.
hist(citric.acid, freq = FALSE)
lines(density(citric.acid), col = "red", lwd = 5)

# from histrogram skewness/outlier of data is not very clear. so let's check by boxplot.

boxplot(citric.acid)

# so from boxplot it seems there is only 1 extreme value,which could be or could not be an outlier and might be just human error. 

#(4)**residual sugar
summary(residual.sugar)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.900   2.200   2.539   2.600  15.500

# it is evident from summary data the distence bw 3rd qu and max difference is 12.9. So there is strong presence of outliers.

# Let's check the data spread using the mean +/- 2 standard deviations band
# (a spread interval for the observations, not a confidence interval).

sd(residual.sugar)  # 1.41

## [1] 1.409928

var(residual.sugar) # 1.99

## [1] 1.987897

# Keep the bounds in variables so the coverage check below uses the exact
# values instead of hand-copied, rounded constants.
sugar_lower <- mean(residual.sugar) - 2 * sd(residual.sugar)
sugar_lower # lower bound, about -0.281

## [1] -0.2810506

sugar_upper <- mean(residual.sugar) + 2 * sd(residual.sugar)
sugar_upper # upper bound, about 5.36

## [1] 5.358662

# The mean of a logical vector is the proportion of TRUE values, so the sample
# size does not have to be hard-coded.
mean(residual.sugar > sugar_lower & residual.sugar < sugar_upper)

## [1] 0.9530957

# 95.3% of the data lies in the range -0.281 to +5.36 and the remaining 4.7% lies outside this range.

# Since it is clear that it is not normal distribution so let's check it's skeweness wheather left or right by histograme and probability distribution line.

# Density-scaled histogram of residual sugar, overlaid with its kernel density
# estimate as a thick red curve.
hist(residual.sugar, freq = FALSE)
lines(density(residual.sugar), col = "red", lwd = 5)

# from probability distribution curve it is clear that it is right side skwed.

# Now let's further check outliers position through boxplot.

boxplot(residual.sugar)

table(residual.sugar)

## residual.sugar
##  0.9  1.2  1.3  1.4  1.5  1.6 1.65  1.7 1.75  1.8  1.9    2 2.05  2.1 2.15  2.2 
##    2    8    5   35   30   58    2   76    2  129  117  156    2  128    2  131 
## 2.25  2.3 2.35  2.4  2.5 2.55  2.6 2.65  2.7  2.8 2.85  2.9 2.95    3  3.1  3.2 
##    1  109    1   86   84    1   79    1   39   49    1   24    1   25    7   15 
##  3.3  3.4 3.45  3.5  3.6 3.65  3.7 3.75  3.8  3.9    4  4.1  4.2 4.25  4.3  4.4 
##   11   15    1    2    8    1    4    1    8    6   11    6    5    1    8    4 
##  4.5  4.6 4.65  4.7  4.8    5  5.1 5.15  5.2  5.4  5.5  5.6  5.7  5.8  5.9    6 
##    4    6    2    1    3    1    5    1    3    1    8    6    1    4    3    4 
##  6.1  6.2  6.3  6.4 6.55  6.6  6.7    7  7.2  7.3  7.5  7.8  7.9  8.1  8.3  8.6 
##    4    3    2    3    2    2    2    1    1    1    1    2    3    2    3    1 
##  8.8  8.9    9 10.7   11 12.9 13.4 13.8 13.9 15.4 15.5 
##    2    1    1    1    2    1    1    2    1    2    1

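# From the boxplot and the table it is clear that there are outliers: the 1.5*IQR rule (upper fence 2.6 + 1.5*0.7 = 3.65) flags 155 observations, of which 11 are extreme values above 10. This is a significant number of outliers and they need to be treated.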

#(5)**Chlorides
summary(chlorides)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01200 0.07000 0.07900 0.08747 0.09000 0.61100

# from summary it seems the difference bw 3rd quadrent and max is 0.52,which means there is a possiblity of presence of outliers.

hist(chlorides,freq=FALSE)

# from histograme it is clear that data is right side skwed and outliers are also present.so let's again check presence of outlier by boxplot.

lines(x = density(x = chlorides), col = "red",lwd=5)

boxplot(chlorides)

# from boxplot it is clear that very few outliers are present and might need be looked at throughly before utilizing this data.

#(6)**free sulfur dioxide 
summary(free.sulfur.dioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   14.00   15.87   21.00   72.00

# Although at first glance the data quality looks good, as there is not much difference between the mean and median, the max value (72) is more than 3 times the 3rd quartile (21), which points to the presence of outliers.

# so lets find outliers

sd(free.sulfur.dioxide)  # 10.46

## [1] 10.46016

var(free.sulfur.dioxide) # 109.41

## [1] 109.4149

# Mean +/- 2 SD bounds (a spread band for the observations, not a confidence
# interval), stored so the coverage check uses the exact values rather than
# hand-copied, rounded constants.
so2_lower <- mean(free.sulfur.dioxide) - 2 * sd(free.sulfur.dioxide)
so2_lower # lower bound, about -5.05

## [1] -5.045392

so2_upper <- mean(free.sulfur.dioxide) + 2 * sd(free.sulfur.dioxide)
so2_upper # upper bound, about 36.80

## [1] 36.79524

# The mean of a logical vector is the proportion of TRUE values, so the sample
# size does not have to be hard-coded.
mean(free.sulfur.dioxide > so2_lower & free.sulfur.dioxide < so2_upper)

## [1] 0.9587242

# 95.9% of the data lies in the range -5.05 to +36.80 and the remaining 4.1% lies outside this range. That reconfirms the presence of outliers.
# so lets see presence of outlier through histogram.

hist(free.sulfur.dioxide)

# and yes the data is right side skwed and not a standard normal distribution.
#Since data is discrete so it shall be represneted by frequency count.
# so let's find outliers through boxplot.

boxplot(free.sulfur.dioxide)

# from boxplot it is clear that outlier is present and that might be data capturing issue.
table(free.sulfur.dioxide)

## free.sulfur.dioxide
##    1    2    3    4    5  5.5    6    7    8    9   10   11   12   13   14   15 
##    3    1   49   41  104    1  138   71   56   62   79   59   75   57   50   78 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##   61   60   46   39   30   41   22   32   34   24   32   29   23   23   16   20 
##   32   33   34   35   36   37 37.5   38   39   40 40.5   41   42   43   45   46 
##   22   11   18   15   11    3    2    9    5    6    1    7    3    3    3    1 
##   47   48   50   51   52   53   54   55   57   66   68   72 
##    1    4    2    4    3    1    1    2    1    1    2    1

# from boxplot and table it is clear that more than value 60, 4 data points seems to outliers.

#(7)** Total sulfer dioxide
summary(total.sulfur.dioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   22.00   38.00   46.47   62.00  289.00

#There is a difference in mean and median with 8.47,that shows data is not normally distributed.also distence bw 3rd quadrent(62) and max limit(289) is 4.66 times.So it seems outlier is present in data.
# Lets further investigate through histograme.

hist(total.sulfur.dioxide)

#Since data is discrete so it shall be represneted by frequency count.
# from histograme it is clear that data is right side skwed.
# now lets check outliers 

boxplot(total.sulfur.dioxide)

# from boxplot it is evident that outlier is present,and needs to be further investigated/treated.


#(8)**Density**
summary(density)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.9901  0.9956  0.9968  0.9967  0.9978  1.0037

# The summary shows that the mean and median are almost the same and the distance between the 3rd quartile and the max value is only about 0.006. That suggests the data is roughly symmetric and may be normally distributed.

# lets check data distribution through histograme.

# Density-scaled histogram of the wine density column.
hist(density, freq = FALSE)
# The column `density` shares its name with the stats::density() function, so
# the function is called through its namespace and the column through the data
# frame to make clear which is which.
lines(x = stats::density(x = Winequality_Red$density), col = "red", lwd = 5)

# and yes from histograme and probability distribution curve it is clear that data is normally distributed.

#(9)** ph
summary(pH)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.740   3.210   3.310   3.311   3.400   4.010

# Data summary shows that mean and median are equal so data quality seems good at first sight.The distence b/w 3rd qudrant and max is also 0.6 so need to be further investigate.
# lets check it with histograme.

hist(pH,freq = FALSE)
# Histograme shows that the data is normally distributed.and data is good for study.Although it shows presence of few outlier at the extreme right side of graph,which is unclear.

lines(x = density(x =pH), col = "red",lwd=5)

# so let's check with boxplot.
boxplot(pH)

# from boxplot 1 data point seems to be little distent but it still does not confirms presence of the outlier.

# so we will further check it with IQR calculation.

# Compute the quartiles from the data instead of copying them from summary().
# The range is stored as ph_iqr so that the name of stats::IQR() is not reused.
ph_quartiles <- quantile(pH, probs = c(0.25, 0.75), names = FALSE)
ph_iqr <- ph_quartiles[2] - ph_quartiles[1]
ph_iqr

## [1] 0.19

# Upper fence: 3rd quartile + 1.5 * IQR.
Outlier_upper <- ph_quartiles[2] + 1.5 * ph_iqr
Outlier_upper   # 3.685

## [1] 3.685

# Lower fence: 1st quartile - 1.5 * IQR.
Outlier_lower <- ph_quartiles[1] - 1.5 * ph_iqr
Outlier_lower   # 2.925

## [1] 2.925

## [1] 2.925

# so now outlier would be below 2.925 or more than 3.685 
# so now we will check the same by table funciton.

table(pH)

## pH
## 2.74 2.86 2.87 2.88 2.89  2.9 2.92 2.93 2.94 2.95 2.98 2.99    3 3.01 3.02 3.03 
##    1    1    1    2    4    1    4    3    4    1    5    2    6    5    8    6 
## 3.04 3.05 3.06 3.07 3.08 3.09  3.1 3.11 3.12 3.13 3.14 3.15 3.16 3.17 3.18 3.19 
##   10    8   10   11   11   11   19    9   20   13   21   34   36   27   30   25 
##  3.2 3.21 3.22 3.23 3.24 3.25 3.26 3.27 3.28 3.29  3.3 3.31 3.32 3.33 3.34 3.35 
##   39   36   39   32   29   26   53   35   42   46   57   39   45   37   43   39 
## 3.36 3.37 3.38 3.39  3.4 3.41 3.42 3.43 3.44 3.45 3.46 3.47 3.48 3.49  3.5 3.51 
##   56   37   48   48   37   34   33   17   29   20   22   21   19   10   14   15 
## 3.52 3.53 3.54 3.55 3.56 3.57 3.58 3.59  3.6 3.61 3.62 3.63 3.66 3.67 3.68 3.69 
##   18   17   16    8   11   10   10    8    7    8    4    3    4    3    5    4 
##  3.7 3.71 3.72 3.74 3.75 3.78 3.85  3.9 4.01 
##    1    4    3    1    1    2    1    2    2

# So from the table it is clear that there are 21 outliers on the upper side (pH > 3.685) and 14 on the lower side (pH < 2.925).


#(10)**sulphates
summary(sulphates)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3300  0.5500  0.6200  0.6581  0.7300  2.0000

#from data summary it shows mean and median are almost same so not sure about presnce of outliers.
#so lets further check it through histograme.

# Density-scaled histogram of sulphates with its kernel density estimate drawn
# on top as a thick red curve.
hist(sulphates, freq = FALSE)
lines(density(sulphates), col = "red", lwd = 5)

# histogram shows that graph is not normally distributed and slightly skewed on right hand side.
#also from and probability distribution curve it is evident that data is right side skewed.
# so lets check presence of outlier through boxplot.

boxplot(sulphates)

# boxplot shows presence of outliers,which needs to be accounted for before further utilization of data.

#(11)**alcohal**
summary(alcohol)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.40    9.50   10.20   10.42   11.10   14.90

#from data summary it shows mean and median are almost same so not sure about presnce of outliers.
#so lets further check it through histograme.

# Density-scaled histogram of alcohol content, overlaid with its kernel density
# estimate as a thick red curve.
hist(alcohol, freq = FALSE)
lines(density(alcohol), col = "red", lwd = 5)

# from histograme it is visible that data is skewed towards right hand side.but there is still no sign of presence of outlier as data is connected throughout.

# so it seems that outlier is not present.and no issue seems in data quality.
# so will further investigate through boxplot.

boxplot(alcohol)

# boxplot shows presence of outliers,which needs to be accounted for before further utilization of data.

#(12)**quality 
summary(quality)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.636   6.000   8.000

# data type is discrete numerical. summary shows that mean and median having difference of 0.4. so there might be possiblity of presnce of outliers. The distence b/w 3rd qudrant and max is also 2.0 so there might be presence of outlier.
# lets further check with histograme.

hist(quality)

#from histograme it seems that there is gap in data and not connected to each other.so can not comment on outlier from histograme.

# so lets further check it through boxplot to check presence of outliers.

boxplot(quality)

# from boxplot it seems presence of very few outliers.

Summary

Q1 : Total sample size : 1599
Q2 : Outliers : My interpretation listed in below table.

also Concerns about data quality are described in output file with respect to each variable.
Q3 : summary function has been used to describe each variable in concise way.

Statistics used to explain variable - Mean,Median,IQR,Percentile,CI,SD,VAR
Q4 : A histogram with a probability density curve is used for continuous variables, and a histogram with frequency counts for discrete variables. Boxplots have been used to check for the presence of outliers.
Q5 : Skewness with respect to individual variable is described in output file.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Wine_Project_Final

Akash Kalambe

2022-09-04

Summary