# If you don't have tidyverse installed yet, run once:
# install.packages("tidyverse")
library(tidyverse)
library(dplyr)
library(ggplot2)
# Load the red-wine data set. The file is semicolon-delimited and its first
# row holds the column names.
red <- read.csv(file = "winequality-red.csv", sep = ";", header = TRUE)

# (a) Sample size: the number of rows (one row per observation).
nrow(x = red)
## [1] 1599
# Per-column summary statistics (min, quartiles, median, mean, max).
# NOTE(review): this step was previously labelled "(a) Sample size", the same
# as the nrow() call -- confirm which assignment part it answers.
summary(object = red)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
# Reshape to long format for plotting: every column of `red` is stacked into
# a `variable` (column name) / `value` (cell value) pair, so each variable
# can be drawn in its own facet.
red_long <- pivot_longer(
  data = red,
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)
# (d, e) Distributions: one 30-bin histogram per variable, three facets per
# row. Free scales give each facet its own axis ranges, because the variables
# span very different magnitudes (e.g. density near 1 vs. total sulfur
# dioxide up to 289).
ggplot(data = red_long, mapping = aes(x = value)) +
  facet_wrap(vars(variable), ncol = 3, scales = "free") +
  geom_histogram(fill = "#5DADE2", color = "white", bins = 30) +
  theme_minimal() +
  labs(y = "Count", x = NULL, title = "Red Wine — Distributions by Variable")

# Boxplots: one box per variable, each in its own facet (three per row) with
# free scales so that large-valued variables do not flatten the others.
# Outlier points are drawn semi-transparent (alpha 0.6).
ggplot(data = red_long, mapping = aes(x = variable, y = value)) +
  facet_wrap(vars(variable), ncol = 3, scales = "free") +
  geom_boxplot(outlier.alpha = 0.6) +
  theme_minimal() +
  labs(y = "Value", x = NULL, title = "Red Wine — Boxplots by Variable")

Question: Are there any outliers? Do you have any concerns about the data quality?

There are clear high-end outliers in residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, sulphates, citric acid, volatile acidity, and alcohol. Fixed acidity has milder extremes, while pH and density show very few outliers.

Data quality looks good: no obvious anomalies stand out. All values are non-negative and fall within plausible ranges, and the summary output reports no missing values (no NA counts appear for any column).

Question: Do you see any skewed distributions?

Yes, several distributions are right-skewed: residual sugar, chlorides, free and total sulfur dioxide, sulphates, citric acid, volatile acidity, and, to a lesser extent, alcohol and fixed acidity all show long right tails. pH and density look roughly symmetric.