Introduction

In this report, I will be exploring data on both red and white wine in order to better understand their characteristics and quality factors. I will be trying to answer the following questions:

  1. What is the sample size?
  2. Any outliers? Do you have any concerns about the data quality?
  3. How can you summarize the data of each variable in a concise way? What statistics are you going to present?
  4. How can you visualize the distribution of each variable?
  5. Do you see any skewed distributions?


Sample Size

I will load in my data set first from the downloaded wine+quality zip file, and import any packages for the report.

# Load the dataset and packages
library(ggplot2)
library(dplyr)
library(tidyr)
library(psych)
library(patchwork)

# The separator is passed explicitly because the files are read with ";"
# rather than read.csv()'s default ","
red_df <- read.csv(file = "wine+quality/winequality-red.csv", sep = ";")
white_df <- read.csv(file = "wine+quality/winequality-white.csv", sep = ";")

To look at the sample size, I will use nrow() to count the number of observations in both data sets. I also want to find out whether there are any missing values, so I will count the NA values in each data set as well.

# Counting Observations in Dataset
# nrow() gives the number of rows, i.e. observations, in each data frame
nrow(red_df)
## [1] 1599
nrow(white_df)
## [1] 4898
# Counting missing (NA) values
# is.na() flags every NA cell; sum() totals the flags over the whole data frame
sum(is.na(red_df))
## [1] 0
sum(is.na(white_df))
## [1] 0

The results show that the red and white wine data sets have 1599 and 4898 observations respectively with no missing values.


Outliers

Box plots of both data sets show that ‘residual.sugar’, ‘chlorides’, and ‘sulphates’ have noticeable outliers. I have drawn each box plot individually so that the outliers are easier to see at the scale of each variable. The red and white wine data sets also differ greatly in their number of observations, so their visualizations are kept separate to show each data set’s outliers more clearly.

# Reshape residual sugar into a variable/value pair so the column name can
# serve as the x-axis category of the box plot
red_outlier <- pivot_longer(
  red_df,
  cols = residual.sugar,
  names_to = "variable",
  values_to = "value"
)

# The constant fill given to geom_boxplot() overrides the mapped fill
# aesthetic, so the box is drawn in a single colour
ggplot(
  data = red_outlier,
  mapping = aes(x = variable, y = value, fill = variable)
) +
  geom_boxplot(outlier.size = 2, outlier.color = "red", fill = "firebrick") +
  labs(y = "Value", x = "Variable", title = "Red Wine: Residual Sugar") +
  theme_minimal() +
  theme(legend.position = "none")

# Reshape chlorides into a variable/value pair so the column name can serve
# as the x-axis category of the box plot
red_outlier <- pivot_longer(
  red_df,
  cols = chlorides,
  names_to = "variable",
  values_to = "value"
)

# The constant fill given to geom_boxplot() overrides the mapped fill
# aesthetic, so the box is drawn in a single colour
ggplot(
  data = red_outlier,
  mapping = aes(x = variable, y = value, fill = variable)
) +
  geom_boxplot(outlier.size = 2, outlier.color = "red", fill = "seagreen") +
  labs(y = "Value", x = "Variable", title = "Red Wine: Chlorides") +
  theme_minimal() +
  theme(legend.position = "none")

# Reshape sulphates into a variable/value pair so the column name can serve
# as the x-axis category of the box plot
red_outlier <- pivot_longer(
  red_df,
  cols = sulphates,
  names_to = "variable",
  values_to = "value"
)

# The constant fill given to geom_boxplot() overrides the mapped fill
# aesthetic, so the box is drawn in a single colour
ggplot(
  data = red_outlier,
  mapping = aes(x = variable, y = value, fill = variable)
) +
  geom_boxplot(outlier.size = 2, outlier.color = "red", fill = "orchid4") +
  labs(y = "Value", x = "Variable", title = "Red Wine: Sulphates") +
  theme_minimal() +
  theme(legend.position = "none")

# Reshape residual sugar into a variable/value pair so the column name can
# serve as the x-axis category of the box plot
white_outlier <- pivot_longer(
  white_df,
  cols = residual.sugar,
  names_to = "variable",
  values_to = "value"
)

# The constant fill given to geom_boxplot() overrides the mapped fill
# aesthetic, so the box is drawn in a single colour
ggplot(
  data = white_outlier,
  mapping = aes(x = variable, y = value, fill = variable)
) +
  geom_boxplot(outlier.size = 2, outlier.color = "red", fill = "firebrick") +
  labs(y = "Value", x = "Variable", title = "White Wine: Residual Sugar") +
  theme_minimal() +
  theme(legend.position = "none")

# Reshape white wine chlorides into a variable/value pair so the column name
# can serve as the x-axis category of the box plot
white_outlier <- white_df %>%
  pivot_longer(cols = chlorides,
               names_to = "variable",
               values_to = "value")

# Plot the white wine data reshaped just above. The earlier version passed
# red_outlier here, so the figure titled "White Wine: Chlorides" actually
# showed red wine values.
ggplot(white_outlier, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(fill = "seagreen", outlier.color = "red", outlier.size = 2) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "White Wine: Chlorides",
       x = "Variable",
       y = "Value")

# Reshape white wine sulphates into a variable/value pair so the column name
# can serve as the x-axis category of the box plot
white_outlier <- white_df %>%
  pivot_longer(cols = sulphates,
               names_to = "variable",
               values_to = "value")

# Plot the white wine data reshaped just above. The earlier version passed
# red_outlier here, so the figure titled "White Wine: Sulphates" actually
# showed red wine values.
ggplot(white_outlier, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(fill = "orchid4", outlier.color = "red", outlier.size = 2) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "White Wine: Sulphates",
       x = "Variable",
       y = "Value")

Concerns about the Data Quality

# Per-column minimum, quartiles, mean, and maximum for the red wine data
summary(object = red_df)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000

In terms of data quality, there are some extreme values, such as a maximum residual sugar of 15.5 g/dm^3 in the red wine data, which could be a data-entry error but could also be a legitimate observation of a wine with naturally higher sugar levels. Some of the distributions are skewed, but that will be discussed in a later section. The data sets are generally clean, with no missing values, as shown previously. The outliers and extreme values don’t necessarily invalidate the data sets, but they require more analysis and double-checking.

Summarizing and Displaying Statistics

Each variable will be summarized with its sample size, mean, median, standard deviation, minimum, maximum, and skew. These statistics describe the shape, variability, and central tendency of each variable. They are computed with the psych package, which provides basic descriptive statistics.

# Descriptive statistics for every red wine column via psych::describe()
red_wine_summary_stat <- psych::describe(x = red_df)

# Show only the statistics discussed in this report
red_wine_summary_stat[, c(
  "n", "mean", "median", "sd", "min", "max", "skew"
)]
##                         n  mean median    sd  min    max skew
## fixed.acidity        1599  8.32   7.90  1.74 4.60  15.90 0.98
## volatile.acidity     1599  0.53   0.52  0.18 0.12   1.58 0.67
## citric.acid          1599  0.27   0.26  0.19 0.00   1.00 0.32
## residual.sugar       1599  2.54   2.20  1.41 0.90  15.50 4.53
## chlorides            1599  0.09   0.08  0.05 0.01   0.61 5.67
## free.sulfur.dioxide  1599 15.87  14.00 10.46 1.00  72.00 1.25
## total.sulfur.dioxide 1599 46.47  38.00 32.90 6.00 289.00 1.51
## density              1599  1.00   1.00  0.00 0.99   1.00 0.07
## pH                   1599  3.31   3.31  0.15 2.74   4.01 0.19
## sulphates            1599  0.66   0.62  0.17 0.33   2.00 2.42
## alcohol              1599 10.42  10.20  1.07 8.40  14.90 0.86
## quality              1599  5.64   6.00  0.81 3.00   8.00 0.22
# Descriptive statistics for every white wine column via psych::describe()
white_wine_summary_stat <- psych::describe(x = white_df)

# Show only the statistics discussed in this report
white_wine_summary_stat[, c(
  "n", "mean", "median", "sd", "min", "max", "skew"
)]
##                         n   mean median    sd  min    max skew
## fixed.acidity        4898   6.85   6.80  0.84 3.80  14.20 0.65
## volatile.acidity     4898   0.28   0.26  0.10 0.08   1.10 1.58
## citric.acid          4898   0.33   0.32  0.12 0.00   1.66 1.28
## residual.sugar       4898   6.39   5.20  5.07 0.60  65.80 1.08
## chlorides            4898   0.05   0.04  0.02 0.01   0.35 5.02
## free.sulfur.dioxide  4898  35.31  34.00 17.01 2.00 289.00 1.41
## total.sulfur.dioxide 4898 138.36 134.00 42.50 9.00 440.00 0.39
## density              4898   0.99   0.99  0.00 0.99   1.04 0.98
## pH                   4898   3.19   3.18  0.15 2.72   3.82 0.46
## sulphates            4898   0.49   0.47  0.11 0.22   1.08 0.98
## alcohol              4898  10.51  10.40  1.23 8.00  14.20 0.49
## quality              4898   5.88   6.00  0.89 3.00   9.00 0.16

Distribution of Variables

I think that a standard histogram would be a good way to show the continuous measurements, as we would be able to see the frequency of certain values in the variable. The graph would show a quick visual summary that allows us to see trends, outliers, and skewness.

# Tag every row with its wine type so the origin stays identifiable once the
# two data frames are stacked
red_df[["wine_color"]] <- "Red"
white_df[["wine_color"]] <- "White"

# Stack the red and white rows into a single data frame
wine <- list(red_df, white_df) %>%
  bind_rows()
# Fixed acidity histograms for red and white wine.
# The earlier version mapped residual.sugar on both plots and titled the white
# plot "(Red Wine)". Both plots now map fixed.acidity and are titled correctly.
# The old bin widths (1 and 2) were sized for the wrong variable; 0.5 suits the
# fixed acidity ranges in the summary tables (red 4.6-15.9, white 3.8-14.2)
# and keeps the two panels comparable.
red_fix <- ggplot(red_df, aes(x = fixed.acidity)) +
  geom_histogram(binwidth = 0.5, fill = "darkred", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Fixed Acidity (Red Wine)",
       x = "Fixed Acidity",
       y = "Frequency")

white_fix <- ggplot(white_df, aes(x = fixed.acidity)) +
  geom_histogram(binwidth = 0.5, fill = "goldenrod", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Fixed Acidity (White Wine)",
       x = "Fixed Acidity",
       y = "Frequency")

# patchwork's `+` combines the two plots into one figure
red_fix + white_fix

# Volatile acidity histograms: red wine in dark red, white wine in goldenrod
red_vol <- red_df %>%
  ggplot(aes(x = volatile.acidity)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 0.05) +
  labs(
    y = "Frequency",
    x = "Volatile Acidity",
    title = "Distribution of Volatile Acidity"
  ) +
  theme_minimal()

white_vol <- white_df %>%
  ggplot(aes(x = volatile.acidity)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 0.05) +
  labs(
    y = "Frequency",
    x = "Volatile Acidity",
    title = "Distribution of Volatile Acidity"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_vol + white_vol

# Citric acid histograms: red wine in dark red, white wine in goldenrod
red_citr <- red_df %>%
  ggplot(aes(x = citric.acid)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 0.02) +
  labs(
    y = "Frequency",
    x = "Citric Acid",
    title = "Distribution of Citric Acid"
  ) +
  theme_minimal()

white_citr <- white_df %>%
  ggplot(aes(x = citric.acid)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 0.05) +
  labs(
    y = "Frequency",
    x = "Citric Acid",
    title = "Distribution of Citric Acid"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_citr + white_citr

# Residual sugar histograms for red and white wine.
# The x-axis label previously read "Residual Acid"; the plotted column is
# residual.sugar, so the label now matches the title and the data.
red_res <- ggplot(red_df, aes(x = residual.sugar)) +
  geom_histogram(binwidth = 2, fill = "darkred", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Residual Sugar",
       x = "Residual Sugar",
       y = "Frequency")

white_res <- ggplot(white_df, aes(x = residual.sugar)) +
  geom_histogram(binwidth = 5, fill = "goldenrod", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Residual Sugar",
       x = "Residual Sugar",
       y = "Frequency")

# patchwork's `+` combines the two plots into one figure
red_res + white_res

# Chlorides histograms: red wine in dark red, white wine in goldenrod
red_chl <- red_df %>%
  ggplot(aes(x = chlorides)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 0.02) +
  labs(
    y = "Frequency",
    x = "Chlorides",
    title = "Distribution of Chlorides"
  ) +
  theme_minimal()

white_chl <- white_df %>%
  ggplot(aes(x = chlorides)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 0.02) +
  labs(
    y = "Frequency",
    x = "Chlorides",
    title = "Distribution of Chlorides"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_chl + white_chl

# Free sulfur dioxide histograms: red in dark red, white in goldenrod
red_frsul <- red_df %>%
  ggplot(aes(x = free.sulfur.dioxide)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 5) +
  labs(
    y = "Frequency",
    x = "Free Sulfur Dioxide",
    title = "Distribution of Free Sulfur Dioxide"
  ) +
  theme_minimal()

white_frsul <- white_df %>%
  ggplot(aes(x = free.sulfur.dioxide)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 5) +
  labs(
    y = "Frequency",
    x = "Free Sulfur Dioxide",
    title = "Distribution of Free Sulfur Dioxide"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_frsul + white_frsul

# Total sulfur dioxide histograms: red in dark red, white in goldenrod
red_ttlsul <- red_df %>%
  ggplot(aes(x = total.sulfur.dioxide)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 10) +
  labs(
    y = "Frequency",
    x = "Total Sulfur Dioxide",
    title = "Distribution of Total Sulfur Dioxide"
  ) +
  theme_minimal()

white_ttlsul <- white_df %>%
  ggplot(aes(x = total.sulfur.dioxide)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 10) +
  labs(
    y = "Frequency",
    x = "Total Sulfur Dioxide",
    title = "Distribution of Total Sulfur Dioxide"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_ttlsul + white_ttlsul

# Density histograms for red and white wine.
# The summary tables show density spanning only about 0.99-1.00 (red) and
# 0.99-1.04 (white), so the previous binwidth of 0.02 put nearly every
# observation into one or two bars. A binwidth of 0.001 resolves the shape of
# the distribution.
red_den <- ggplot(red_df, aes(x = density)) +
  geom_histogram(binwidth = 0.001, fill = "darkred", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Density",
       x = "Density",
       y = "Frequency")

white_den <- ggplot(white_df, aes(x = density)) +
  geom_histogram(binwidth = 0.001, fill = "goldenrod", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Density",
       x = "Density",
       y = "Frequency")

# patchwork's `+` combines the two plots into one figure
red_den + white_den

# pH histograms: red wine in dark red, white wine in goldenrod
red_ph <- red_df %>%
  ggplot(aes(x = pH)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 0.25) +
  labs(
    y = "Frequency",
    x = "pH",
    title = "Distribution of pH"
  ) +
  theme_minimal()

white_ph <- white_df %>%
  ggplot(aes(x = pH)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 0.25) +
  labs(
    y = "Frequency",
    x = "pH",
    title = "Distribution of pH"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_ph + white_ph

# Sulphates histograms: red wine in dark red, white wine in goldenrod
red_sulph <- red_df %>%
  ggplot(aes(x = sulphates)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 0.1) +
  labs(
    y = "Frequency",
    x = "Sulphates",
    title = "Distribution of Sulphates"
  ) +
  theme_minimal()

white_sulph <- white_df %>%
  ggplot(aes(x = sulphates)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 0.1) +
  labs(
    y = "Frequency",
    x = "Sulphates",
    title = "Distribution of Sulphates"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_sulph + white_sulph

# Alcohol histograms: red wine in dark red, white wine in goldenrod
red_alc <- red_df %>%
  ggplot(aes(x = alcohol)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 0.5) +
  labs(
    y = "Frequency",
    x = "Alcohol",
    title = "Distribution of Alcohol"
  ) +
  theme_minimal()

white_alc <- white_df %>%
  ggplot(aes(x = alcohol)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 0.5) +
  labs(
    y = "Frequency",
    x = "Alcohol",
    title = "Distribution of Alcohol"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_alc + white_alc

# Quality score histograms: red wine in dark red, white wine in goldenrod
red_qlty <- red_df %>%
  ggplot(aes(x = quality)) +
  geom_histogram(color = "black", fill = "darkred", binwidth = 1) +
  labs(
    y = "Frequency",
    x = "Quality",
    title = "Distribution of Quality"
  ) +
  theme_minimal()

white_qlty <- white_df %>%
  ggplot(aes(x = quality)) +
  geom_histogram(color = "black", fill = "goldenrod", binwidth = 1) +
  labs(
    y = "Frequency",
    x = "Quality",
    title = "Distribution of Quality"
  ) +
  theme_minimal()

# patchwork's `+` combines the two plots into one figure
red_qlty + white_qlty

Skewness

Looking at the histograms, many variables are right-skewed (with a long tail toward higher values), especially the variables that have many outliers, such as ‘Residual Sugar’, ‘Chlorides’, and ‘Sulphates’.