### Load Necessary Libraries and Data

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(knitr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()    masks ggplot2::%+%()
## ✖ psych::alpha()  masks ggplot2::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the dataset (white wine)
df <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ";")
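
A quick sanity check (sketch) confirms the semicolon-separated file parsed into the expected columns:

# Verify dimensions, column types, and missingness before analysis
str(df)
cat("Missing values:", sum(is.na(df)), "\n")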

# Get descriptive statistics using psych::describe
desc_stats <- describe(df)

# Select and format key columns: mean, SD, min, max, median
desc_table <- desc_stats %>%
  as.data.frame() %>%
  tibble::rownames_to_column("Variable") %>%  # use variable names, not the vars index
  select(Variable, n, mean, sd, min, max, median) %>%
  rename(
    Count = n,
    Mean = mean,
    SD = sd,
    Min = min,
    Max = max,
    Median = median
  )

# Print as a nice table
kable(desc_table, caption = "Table: Descriptive Statistics of White Wine Dataset")
Table: Descriptive Statistics of White Wine Dataset

| Variable             | Count |        Mean |         SD |     Min |       Max |    Median |
|:---------------------|------:|------------:|-----------:|--------:|----------:|----------:|
| fixed.acidity        |  4898 |   6.8547877 |  0.8438682 | 3.80000 |  14.20000 |   6.80000 |
| volatile.acidity     |  4898 |   0.2782411 |  0.1007945 | 0.08000 |   1.10000 |   0.26000 |
| citric.acid          |  4898 |   0.3341915 |  0.1210198 | 0.00000 |   1.66000 |   0.32000 |
| residual.sugar       |  4898 |   6.3914149 |  5.0720578 | 0.60000 |  65.80000 |   5.20000 |
| chlorides            |  4898 |   0.0457724 |  0.0218480 | 0.00900 |   0.34600 |   0.04300 |
| free.sulfur.dioxide  |  4898 |  35.3080849 | 17.0071373 | 2.00000 | 289.00000 |  34.00000 |
| total.sulfur.dioxide |  4898 | 138.3606574 | 42.4980646 | 9.00000 | 440.00000 | 134.00000 |
| density              |  4898 |   0.9940274 |  0.0029909 | 0.98711 |   1.03898 |   0.99374 |
| pH                   |  4898 |   3.1882666 |  0.1510006 | 2.72000 |   3.82000 |   3.18000 |
| sulphates            |  4898 |   0.4898469 |  0.1141258 | 0.22000 |   1.08000 |   0.47000 |
| alcohol              |  4898 |  10.5142670 |  1.2306206 | 8.00000 |  14.20000 |  10.40000 |
| quality              |  4898 |   5.8779094 |  0.8856386 | 3.00000 |   9.00000 |   6.00000 |

### Distribution of Wine Quality Scores

# Plot quality distribution (stored so it can be saved explicitly later)
p_quality <- ggplot(df, aes(x = factor(quality))) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribution of Wine Quality Scores",
       x = "Wine Quality (Score)",
       y = "Count") +
  theme_minimal()
p_quality
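
Most wines fall at scores 5 and 6; a one-line frequency sketch quantifies the imbalance:

# Percentage of wines at each quality score
round(prop.table(table(df$quality)) * 100, 1)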

### Correlation Heatmap of Wine Features

# Calculate correlation matrix
corr_matrix <- cor(df)

# Plot correlation heatmap
corrplot(corr_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45,
         title = "Correlation Heatmap of Wine Features",
         mar = c(0, 0, 2, 0))  # extra top margin so corrplot does not clip the title
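
To read the heatmap's quality column directly, a short sketch ranks each feature's correlation with quality:

# Features ranked by correlation with quality (strongest positive first)
q_corr <- sort(corr_matrix[, "quality"], decreasing = TRUE)
round(q_corr[names(q_corr) != "quality"], 3)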

### Alcohol Content by Wine Quality

# Alcohol vs Quality
ggplot(df, aes(x = factor(quality), y = alcohol)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Alcohol Content by Wine Quality",
       x = "Wine Quality (Score)",
       y = "Alcohol (%)") +
  theme_minimal()

### Volatile Acidity by Wine Quality

ggplot(df, aes(x = factor(quality), y = volatile.acidity)) +
  geom_boxplot(fill = "tomato") +
  labs(title = "Volatile Acidity by Wine Quality",
       x = "Wine Quality (Score)",
       y = "Volatile Acidity (g/dm³)") +
  theme_minimal()
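
The downward trend in the boxplots can be confirmed numerically with a short dplyr sketch:

# Mean volatile acidity at each quality level
df %>%
  group_by(quality) %>%
  summarise(mean_volatile_acidity = round(mean(volatile.acidity), 3))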

ggsave("quality_distribution.png")
## Saving 7 x 5 in image
ggsave("correlation_heatmap.png")
## Saving 7 x 5 in image
# Train-test split (70-30)
set.seed(123)
trainIndex <- createDataPartition(df$quality, p = 0.7, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]
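
Since createDataPartition() stratifies on the outcome, a quick sketch verifies both sets have similar quality distributions (quality spans 3-9 per the descriptive table; fixing the factor levels keeps the two rows aligned):

# Compare quality distributions across the 70/30 split
round(rbind(
  train = prop.table(table(factor(train$quality, levels = 3:9))),
  test  = prop.table(table(factor(test$quality,  levels = 3:9)))
), 3)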

# Fit Multiple Linear Regression
mlr_model <- lm(quality ~ ., data = train)

# Model summary
summary(mlr_model)
## 
## Call:
## lm(formula = quality ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4802 -0.4905 -0.0488  0.4641  3.1275 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.298e+02  2.086e+01   6.223 5.46e-10 ***
## fixed.acidity         5.471e-02  2.379e-02   2.300   0.0215 *  
## volatile.acidity     -1.764e+00  1.328e-01 -13.281  < 2e-16 ***
## citric.acid           7.488e-02  1.136e-01   0.659   0.5098    
## residual.sugar        7.074e-02  8.575e-03   8.249 2.24e-16 ***
## chlorides            -5.646e-01  6.428e-01  -0.878   0.3799    
## free.sulfur.dioxide   5.183e-03  1.038e-03   4.995 6.17e-07 ***
## total.sulfur.dioxide -5.857e-04  4.477e-04  -1.308   0.1909    
## density              -1.300e+02  2.116e+01  -6.142 9.11e-10 ***
## pH                    7.197e-01  1.224e-01   5.880 4.49e-09 ***
## sulphates             6.718e-01  1.178e-01   5.705 1.26e-08 ***
## alcohol               2.084e-01  2.694e-02   7.736 1.34e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7488 on 3417 degrees of freedom
## Multiple R-squared:  0.2855, Adjusted R-squared:  0.2832 
## F-statistic: 124.1 on 11 and 3417 DF,  p-value: < 2.2e-16
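
The large intercept and density coefficients hint at collinearity (density is largely determined by sugar and alcohol); a sketch, assuming the car package is installed, checks variance inflation factors:

# VIF values well above 10 flag problematic collinearity
car::vif(mlr_model)
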
# Predictions
mlr_pred <- predict(mlr_model, newdata = test)

# Evaluate performance
mlr_rmse <- RMSE(mlr_pred, test$quality)
mlr_r2 <- R2(mlr_pred, test$quality)
mlr_mae <- MAE(mlr_pred, test$quality)

cat("MLR RMSE:", mlr_rmse, "\n")
## MLR RMSE: 0.7594318
cat("MLR R-squared:", mlr_r2, "\n")
## MLR R-squared: 0.2695063
cat("MLR MAE:", mlr_mae, "\n")
## MLR MAE: 0.5896617
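
Before leaning on the linear fit, the standard diagnostic plots are worth a glance (base-R sketch):

# Residuals vs fitted, normal Q-Q, scale-location, and leverage plots
par(mfrow = c(2, 2))
plot(mlr_model)
par(mfrow = c(1, 1))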

### Random Forest

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Fit Random Forest with 500 trees
set.seed(123)
rf_model <- randomForest(quality ~ ., data = train, ntree = 500, importance = TRUE)

# Model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = quality ~ ., data = train, ntree = 500,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.3932654
##                     % Var explained: 49.71
# Feature importance plot
varImpPlot(rf_model)

# Predictions
rf_pred <- predict(rf_model, newdata = test)

# Evaluate performance
rf_rmse <- RMSE(rf_pred, test$quality)
rf_r2 <- R2(rf_pred, test$quality)
rf_mae <- MAE(rf_pred, test$quality)

cat("RF RMSE:", rf_rmse, "\n")
## RF RMSE: 0.6089339
cat("RF R-squared:", rf_r2, "\n")
## RF R-squared: 0.5401387
cat("RF MAE:", rf_mae, "\n")
## RF MAE: 0.4447028
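
The fit above uses the default mtry (about p/3, i.e. 3, for regression); a sketch with randomForest::tuneRF searches nearby values by out-of-bag error:

# Step mtry by a factor of 1.5 until OOB error stops improving by at least 1%
set.seed(123)
tuneRF(train[, setdiff(names(train), "quality")], train$quality,
       ntreeTry = 500, stepFactor = 1.5, improve = 0.01)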

### XGBoost

library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
# Prepare data (convert to matrix format)
train_matrix <- as.matrix(train[, -which(names(train) == "quality")])
train_label <- train$quality
test_matrix <- as.matrix(test[, -which(names(test) == "quality")])
test_label <- test$quality

# Convert to DMatrix
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
dtest <- xgb.DMatrix(data = test_matrix, label = test_label)

# Set parameters
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse",
  max_depth = 6,
  eta = 0.1,
  subsample = 0.8
)

# Train XGBoost
set.seed(123)
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain, eval = dtest),
  early_stopping_rounds = 10,
  verbose = 0
)
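
Early stopping above monitors the held-out test set, which lets the test data influence training; a cross-validated sketch instead picks the round count from the training data alone:

# 5-fold CV on the training set to choose nrounds without touching the test set
set.seed(123)
cv_res <- xgb.cv(params = params, data = dtrain, nrounds = 200,
                 nfold = 5, early_stopping_rounds = 10, verbose = 0)
cv_res$best_iteration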

# Predictions
xgb_pred <- predict(xgb_model, newdata = dtest)

# Evaluate performance
xgb_rmse <- RMSE(xgb_pred, test_label)
xgb_r2 <- R2(xgb_pred, test_label)
xgb_mae <- MAE(xgb_pred, test_label)

cat("XGBoost RMSE:", xgb_rmse, "\n")
## XGBoost RMSE: 0.6355915
cat("XGBoost R-squared:", xgb_r2, "\n")
## XGBoost R-squared: 0.4886706
cat("XGBoost MAE:", xgb_mae, "\n")
## XGBoost MAE: 0.4811395
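
For parity with varImpPlot() on the Random Forest, the booster's gain-based importance can be inspected with a short sketch:

# Gain-based feature importance from the trained booster
imp <- xgb.importance(model = xgb_model)
xgb.plot.importance(imp)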

### Model Performance

# Collect the test-set metrics computed above rather than hard-coding them
df_perf <- data.frame(
  Model = c("Multiple Linear Regression", "XGBoost", "Random Forest"),
  RMSE = c(mlr_rmse, xgb_rmse, rf_rmse),
  R_squared = c(mlr_r2, xgb_r2, rf_r2),
  MAE = c(mlr_mae, xgb_mae, rf_mae)
)

# Reshape to long format
df_long <- df_perf %>%
  pivot_longer(cols = c(RMSE, R_squared, MAE), names_to = "Metric", values_to = "Value")

# Plot with color by Model, removing x-axis labels
ggplot(df_long, aes(x = Model, y = Value, fill = Model)) +
  geom_col(position = "dodge") +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(title = "Model Performance Comparison",
       x = "Model", y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
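
The same comparison reads well as a table, reusing the kable style from earlier:

# Companion table for the faceted bar chart
kable(df_perf, digits = 4, caption = "Table: Test-Set Performance by Model")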