library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the semicolon-delimited red-wine quality data (first row holds the
# column names) and preview the first six rows.
wine_data <- read.csv(
  file = "data/winequality-red.csv",
  header = TRUE,
  sep = ";"
)
head(wine_data, n = 6L)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
#Q1 Sample Size
# Sample size = number of rows (observations), printed as a character string.
print(as.character(nrow(wine_data)))
## [1] "1599"
#Q2 Any outliers? Do you have any concerns about the data quality?
There are multiple outliers, as seen in the summary statistics: in several columns the measures of central tendency (mean and median) differ hugely from the max or min values.
For example, the max value of fixed.acidity is 15.90, whereas the mean is 8.32 and the 3rd quartile is only 9.20, showing that the max value lies far above the 3rd quartile.
The primary concerns about the data are deciding to what extent values beyond the 95th percentile should be considered and how to deal with them: should we eliminate such entries, or are they rare but genuine occurrences that need to be addressed?
Also, the difference between the mean and the median in several columns shows that the data provided is heavily skewed. Is this expected?
Another concern is that the quality column contains no values in the ranges 4-4.5, 5-5.5, 6-6.5, and 7-7.5, i.e. it appears to hold only whole-number scores. Is this expected?
#Q3 Summary Statistics. The statistics to display are mean, median, max, min, std. deviation, and variance
# Per-column five-number summary plus the mean (min, quartiles, median, max).
print(summary(wine_data))
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# summary() does not report standard deviation or variance, so compute both
# for every numeric column. vapply() guarantees exactly one numeric value per
# column; na.rm = TRUE keeps a stray missing value from turning a result into NA.
# For a single column, use e.g. sd(wine_data$alcohol) or var(wine_data$alcohol).
numeric_columns <- wine_data[vapply(wine_data, is.numeric, logical(1))]
stddev <- vapply(numeric_columns, sd, numeric(1), na.rm = TRUE)
variance <- vapply(numeric_columns, var, numeric(1), na.rm = TRUE)
# One row per column, so both statistics can be read side by side.
print(data.frame(std_dev = stddev, variance = variance))
#Q4 In order to visualize the data, histograms and boxplots are arguably the best tools: a histogram helps us understand the degree of skewness, and a boxplot helps us see the number of outliers in each variable. #One example is shown below; the rest can be created by modifying the dummy code.
# Histogram: shows the shape and skewness of the distribution. The title and
# axis label are set explicitly; the defaults would display the raw
# expression `wine_data$fixed.acidity`.
hist(
  wine_data$fixed.acidity,
  main = "Histogram of fixed.acidity",
  xlab = "fixed.acidity"
)
# Boxplot: shows the quartiles and flags outliers. It has no title or axis
# label by default, so both are supplied here.
boxplot(
  wine_data$fixed.acidity,
  main = "Boxplot of fixed.acidity",
  ylab = "fixed.acidity"
)
#Dummy Code
#hist(wine_data$Column_Name, main = "Histogram of Column_Name", xlab = "Column_Name")
#boxplot(wine_data$Column_Name, main = "Boxplot of Column_Name", ylab = "Column_Name")
#Q5 Skewness of data
# Draws a histogram and a boxplot for one column of a data frame.
#   column_name: name of the column to plot, given as a single string.
#   data:        data frame holding the column; defaults to wine_data.
# The column name is used for titles and axis labels so that every plot is
# labelled consistently. Stops with an error if the column does not exist.
# Returns the plotted values invisibly.
plot_distribution <- function(column_name, data = wine_data) {
  stopifnot(
    is.character(column_name),
    length(column_name) == 1L,
    column_name %in% names(data)
  )
  values <- data[[column_name]]
  hist(values, main = paste("Histogram of", column_name), xlab = column_name)
  boxplot(values, main = paste("Boxplot of", column_name), ylab = column_name)
  invisible(values)
}
#1 fixed.acidity - Data is skewed to the right
plot_distribution("fixed.acidity")
#2 volatile.acidity - Few outliers and follows a normal curve.
plot_distribution("volatile.acidity")
#3 citric.acid - Data is heavily skewed to the right
plot_distribution("citric.acid")
#4 residual.sugar - Data is skewed to the right
plot_distribution("residual.sugar")
#5 chlorides - Data is skewed to the right, too many outliers
plot_distribution("chlorides")
#6 free.sulfur.dioxide - Data is skewed to the right, too many outliers
plot_distribution("free.sulfur.dioxide")
#7 total.sulfur.dioxide - Skewed to the right without too many outliers
plot_distribution("total.sulfur.dioxide")
#8 density - Follows a normal distribution curve, 1-2 outliers
plot_distribution("density")
#9 pH - Follows a normal distribution curve, 1-2 outliers
plot_distribution("pH")
#10 sulphates - Skewed to the right, multiple outliers
plot_distribution("sulphates")
#11 alcohol - Skewed to the right, 1-2 outliers
plot_distribution("alcohol")
#12 quality - Values are missing between the whole-number scores, but it follows a normal distribution curve.
plot_distribution("quality")