Questions:

  1. Any outliers? Do you have any concerns about the data quality?
  2. How can you summarize the data of each variable in a concise way? What statistics are you going to present?
  3. Do you see any skewed distributions?

Observations:

  1. Quality has most values in the categories 5, 6 and 7.
  2. Fixed acidity, volatile acidity and citric acid have outliers. Removing the outliers would bring these distributions closer to normal.
  3. Residual sugar has a right-skewed distribution, and its outliers are significant in number (~8%).
  4. Free sulfur dioxide and density have a few outliers; even after removing them, the distributions will still be skewed.
  5. Alcohol is skewed on both the right and the left side.
# Read the red-wine quality CSV; `data.table = FALSE` asks fread() to return
# a plain data.frame instead of a data.table.
RedWine <- data.table::fread(
  file = "/Users/uraj/Desktop/StatMethods/DataFile/winequality-red.csv",
  data.table = FALSE
)

# NOTE(review): attach() places the columns of RedWine on the search path.
# The code visible here always uses RedWine$<column>, so this may be
# unnecessary -- confirm nothing else relies on bare column names before
# removing it.
attach(RedWine)
# Question a
# Inspect the data set: its dimensions first, then per-column summary
# statistics (min, quartiles, median, mean, max).
# Lines starting with `##` below appear to be console output captured from a
# previous run; they are comments and are not executed.

# Number of rows and columns in RedWine.
dim(RedWine)
## [1] 1599   12
# Six-number summary of every column in RedWine.
summary(RedWine)
##  fixed_acidity   volatile_acidity  citric_acid    residual_sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free_sulfur_dioxide total_sulfur_dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
# Question d: How can you visualize the distribution of each variable?

# Box plots, one per column of RedWine.
# The vector maps column name -> plot title; its order is the drawing order.
boxplot_titles <- c(
  fixed_acidity = "Fixed Acidity",
  volatile_acidity = "Volatile Acidity",
  citric_acid = "Citric Acid",
  residual_sugar = "Residual Sugar Content",
  chlorides = "Chloride Content",
  free_sulfur_dioxide = "Free Sulfur Dioxide",
  total_sulfur_dioxide = "Total Sulfur Dioxide",
  density = "Density",
  pH = "pH",
  sulphates = "Sulphate Content",
  alcohol = "Alcohol Content",
  quality = "Quality"
)

# Fail before drawing anything if an expected column is absent, rather than
# erroring part-way through the sequence of plots.
stopifnot(all(names(boxplot_titles) %in% names(RedWine)))

for (boxplot_column in names(boxplot_titles)) {
  boxplot(
    RedWine[[boxplot_column]],
    col = "slategray2",
    main = boxplot_titles[[boxplot_column]]
  )
}

# Histograms, one per column of RedWine, with the bar counts printed on top
# of each bar (labels = TRUE).
# Columns are listed explicitly so the drawing order is fixed.
hist_columns <- c(
  "fixed_acidity",
  "volatile_acidity",
  "citric_acid",
  "residual_sugar",
  "chlorides",
  "free_sulfur_dioxide",
  "total_sulfur_dioxide",
  "density",
  "pH",
  "sulphates",
  "alcohol",
  "quality"
)

# Fail before drawing anything if an expected column is absent.
stopifnot(all(hist_columns %in% names(RedWine)))

for (hist_column in hist_columns) {
  # Readable label derived from the column name ("fixed_acidity" ->
  # "fixed acidity"). Without explicit main/xlab, hist() would label the plot
  # with the deparsed argument expression instead.
  hist_label <- gsub("_", " ", hist_column, fixed = TRUE)
  hist(
    RedWine[[hist_column]],
    col = "lightblue",
    labels = TRUE,
    main = paste("Histogram of", hist_label),
    xlab = hist_label
  )
}