# Libraries
library(ggplot2)

Dataset description

The red wine quality dataset is a collection of data related to red variants of Portuguese “Vinho Verde” wine. It provides valuable insights into the physicochemical attributes of these wines and their corresponding sensory quality ratings.

Source: Kaggle (www.kaggle.com)

It consists of variables such as:

Fixed Acidity: The concentration of non-volatile acids in the wine.

Volatile Acidity: The concentration of acetic acid in the wine.

Citric Acid: The amount of citric acid in the wine.

Residual Sugar: The remaining sugar content in the wine after fermentation.

Chlorides: The concentration of salts (chlorides) in the wine.

Free Sulfur Dioxide: The level of free sulfur dioxide, which serves as an antimicrobial and antioxidant agent.

Total Sulfur Dioxide: The total amount of sulfur dioxide, including both free and bound forms.

Density: The density of the wine, which is related to its alcohol content and sweetness.

pH: A measure of the wine’s acidity or alkalinity.

Sulphates: The concentration of sulphates (potassium sulphate), a wine additive that can contribute to sulfur dioxide levels.

Alcohol: The alcohol content of the wine.

# Point the session at the folder that holds the dataset and confirm the switch
setwd("/Users/olix/Desktop/Advanced Visualization/archive")
getwd()
## [1] "/Users/olix/Desktop/Advanced Visualization/archive"
# Load the comma-separated red wine data, then preview its first rows
wines <- read.csv(file = "winequality-red.csv", sep = ",", dec = ".")
head(wines)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5
# Show the structure of `wines`: its dimensions plus each column's type
# and first few values
str(wines)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

Scatterplots

Quality vs Alcohol by Volume

# Base scatterplot: alcohol on the x axis, quality score on the y axis,
# with points shaded from light to dark green by quality.
p <- ggplot(data = wines, aes(x = alcohol, y = quality)) +
  geom_point(aes(color = quality), size = 3, alpha = 0.6) +
  labs(x = "Alcohol by volume",
       y = "Quality",
       title = "Quality vs Alcohol by Volume") +
  scale_color_gradient(low = "lightgreen", high = "darkgreen")

# customize the appearance of the plot
# `linewidth` is used for line elements because the `size` argument of
# element_line() is deprecated as of ggplot2 3.4.0.
p1 <- p +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "darkgreen", size = 15, face = "bold"),
    axis.title = element_text(face = "bold", color = "darkgreen"),
    axis.text = element_text(size = 12),
    axis.line = element_line(color = "black", linewidth = 0.5),
    panel.grid.major = element_line(linewidth = 0.5, colour = "lightgreen"),
    panel.grid.major.x = element_blank(),
    panel.background = element_rect(fill = "white", colour = "white"),
    plot.background = element_rect(fill = "aliceblue")
  )

# remove legend for the color scale
# guides() no longer accepts FALSE for a scale (ggplot2 3.3.4+); "none" is
# the supported way to suppress a guide.
p1 <- p1 + guides(color = "none")
p1

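In the above scatterplot depicting the relationship between “Alcohol by volume” and “Quality” for red wines, we can see evident horizontal lines. These lines arise because quality is recorded as an integer score, so the points fall into discrete bands. The banding by itself does not tell us how strongly the two variables are correlated; that has to be judged from how the alcohol values shift from one quality level to the next.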

Fixed Acidity vs pH

# Base scatterplot: fixed acidity on the x axis, pH on the y axis,
# with points shaded from light to dark green by fixed acidity.
p <- ggplot(data = wines, aes(x = fixed.acidity, y = pH)) +
  geom_point(aes(color = fixed.acidity), size = 3, alpha = 0.6) +
  theme_minimal() +
  labs(x = "Fixed Acidity",
       y = "pH",
       title = "Fixed Acidity vs pH") +
  scale_color_gradient(low = "lightgreen", high = "darkgreen")

# the appearance of the plot
# `linewidth` is used for line elements because the `size` argument of
# element_line() is deprecated as of ggplot2 3.4.0.
p1 <- p +
  theme(
    plot.title = element_text(color = "darkgreen", size = 18, face = "bold"),
    axis.title = element_text(face = "bold", color = "darkgreen"),
    axis.text = element_text(size = 12),
    axis.line = element_line(color = "black", linewidth = 0.5),
    panel.grid.minor.y = element_line(linewidth = 0.5, colour = "gray80"),
    panel.background = element_rect(fill = "white", colour = "white"),
    plot.background = element_rect(fill = "aliceblue")
  )

# remove legend for the color scale
# guides() no longer accepts FALSE for a scale (ggplot2 3.3.4+); "none" is
# the supported way to suppress a guide.
p1 <- p1 + guides(color = "none")
p1

From the plot we can see a negative correlation between “Fixed Acidity” and “pH,” indicating that wines with higher fixed acidity tend to have lower pH values.

# Base scatterplot: alcohol on the x axis, residual sugar on the y axis,
# with points shaded from light to dark green by residual sugar.
p <- ggplot(data = wines, aes(x = alcohol, y = residual.sugar)) +
  geom_point(aes(color = residual.sugar), size = 3, alpha = 0.6) +
  labs(x = "Alcohol by volume",
       y = "Residual Sugar",
       title = "Relationship between Alcohol and Residual Sugar") +
  scale_color_gradient(low = "lightgreen", high = "darkgreen")

# customize the appearance of the plot
# `linewidth` is used for line elements because the `size` argument of
# element_line() is deprecated as of ggplot2 3.4.0.
p1 <- p +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "darkgreen", size = 18, face = "bold"),
    axis.title = element_text(face = "bold", color = "darkgreen"),
    axis.text = element_text(size = 12),
    axis.line = element_line(color = "black", linewidth = 0.5),
    panel.grid.major = element_line(linewidth = 0.5, colour = "lightgreen"),  # Add grid lines
    panel.grid.major.x = element_blank(),  # Remove vertical grid lines
    panel.background = element_rect(fill = "white", colour = "white"),
    plot.background = element_rect(fill = "aliceblue")
  )

# remove legend for the color scale
# guides() no longer accepts FALSE for a scale (ggplot2 3.3.4+); "none" is
# the supported way to suppress a guide.
p1 <- p1 + guides(color = "none")
p1

From the plot we can see that there is no clear relation between the two variables. What is more, the majority of the data points have a very similar “Residual Sugar” value, which oscillates around 2.