library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the red-wine quality file. Its fields are separated by semicolons,
# so the separator is given explicitly instead of relying on the comma default.
wine_data <- read.csv(
  file = "data/winequality-red.csv",
  header = TRUE,
  sep = ";"
)
# Preview the first six rows to check that the columns were parsed.
head(wine_data, n = 6L)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5

#Q1 Sample Size

# Sample size = number of rows, printed as a character string.
print(as.character(nrow(wine_data)))
## [1] "1599"

#Q2 Any outliers? Do you have any concerns about the data quality?

There are multiple outliers, as the summary statistics show: in several columns the maximum or minimum value lies very far from the measures of central tendency (mean and median).

For example, the maximum of fixed.acidity is 15.90, whereas the mean is 8.32 and the 3rd quartile is only 9.20, so the maximum lies far above the 3rd quartile. Using the usual 1.5 x IQR rule, the upper fence is 9.20 + 1.5 x (9.20 - 7.10) = 12.35, and 15.90 is well beyond it.

The primary concern about the data is deciding how far values beyond the 95th percentile should be taken into account and how to deal with them: should such entries be eliminated, or are they genuine rare occurrences that need to be addressed?

Also, the difference between the mean and the median in several columns (e.g. total.sulfur.dioxide: mean 46.47 vs. median 38.00) shows that the data is skewed, in some columns heavily. Is this expected?

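Another concern is that the quality column has no values inside ranges such as 4-4.5, 5-5.5, 6-6.5 and 7-7.5, i.e. between the whole-number scores. This is presumably because quality is recorded as a whole-number score (every value shown by head() is an integer), but is this expected?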

#Q3 Summary Statistics. The statistics to display are mean, median, max, min, std. deviation, and variance

# Min, quartiles, median, mean and max for every column.
print(summary(object = wine_data))
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
# summary() does not report spread, so the standard deviation and variance
# are computed explicitly for every numeric column. vapply() is used so the
# result is guaranteed to be one number per column.
numeric_columns <- wine_data[vapply(wine_data, is.numeric, logical(1))]
spread_stats <- data.frame(
  std_dev = vapply(numeric_columns, sd, numeric(1), na.rm = TRUE),
  variance = vapply(numeric_columns, var, numeric(1), na.rm = TRUE)
)
print(spread_stats)
# For a single column use, e.g.:
# stddev <- sd(wine_data$Column_Name)
# variance <- var(wine_data$Column_Name)

#Q4 In order to visualize the data, histograms and boxplots are arguably the best tools: a histogram helps us understand the degree of skewness, and a boxplot helps us see the number of outliers in each column.
#One example is shown below; the rest can be created by modifying the dummy code.

# Worked example for one column. The title and x label are spelled out to
# match what hist() derives by default from the expression
# wine_data$fixed.acidity.
hist(
  wine_data[["fixed.acidity"]],
  main = "Histogram of wine_data$fixed.acidity",
  xlab = "wine_data$fixed.acidity"
)

# Boxplot of the same column; points beyond the whiskers are drawn
# individually.
boxplot(wine_data[["fixed.acidity"]])

# Template for any other column:
# hist(wine_data$Column_Name)
# boxplot(wine_data$Column_Name)

#Q5 Skewness of data

# Draw a histogram followed by a boxplot for every column, in the order the
# columns appear in the file. One loop replaces twelve copies of the same two
# calls. Titles and axis labels are set explicitly so that every plot names
# its column; inside a loop the default histogram title would show the
# indexing expression instead, and boxplots have no title at all by default.
#
# Observations from the plots:
#  1 fixed.acidity        - skewed to the right
#  2 volatile.acidity     - few outliers, roughly follows a normal curve
#  3 citric.acid          - heavily skewed to the right
#  4 residual.sugar       - skewed to the right
#  5 chlorides            - skewed to the right, too many outliers
#  6 free.sulfur.dioxide  - skewed to the right, too many outliers
#  7 total.sulfur.dioxide - skewed to the right, without too many outliers
#  8 density              - follows a normal distribution curve, 1-2 outliers
#  9 pH                   - follows a normal distribution curve, 1-2 outliers
# 10 sulphates            - skewed to the right, multiple outliers
# 11 alcohol              - skewed to the right, 1-2 outliers
# 12 quality              - roughly follows a normal distribution curve; the
#                           histogram has empty bins between the scores.
#                           These are not missing values (summary() reports
#                           no NA's); presumably quality is a whole-number
#                           score - confirm against the data dictionary.
plot_columns <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol", "quality"
)

# Fail early with a clear message if the data does not have the expected
# columns, rather than erroring inside hist() on a NULL column.
stopifnot(all(plot_columns %in% names(wine_data)))

for (column_name in plot_columns) {
  column_values <- wine_data[[column_name]]

  hist(
    column_values,
    main = paste("Histogram of", column_name),
    xlab = column_name
  )

  boxplot(
    column_values,
    main = paste("Boxplot of", column_name),
    ylab = column_name
  )
}