library(tidyverse)   # loads ggplot2, dplyr, readr, tidyr, purrr, ...
library(psych)       # describe() for descriptive statistics
library(knitr)       # kable() for formatted tables
library(corrplot)    # correlation heatmaps
library(caret)       # train/test partitioning and regression metrics
# Read the dataset (white wine)
df <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ";")
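# Optional sanity check on the import: the UCI file is semicolon-separated,
# so a clean read should yield 4898 rows and 12 columns with no missing
# values, matching n = 4898 for every variable in the table below.
dim(df)          # expected: 4898 12
sum(is.na(df))   # expected: 0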
# Get descriptive statistics using psych::describe
desc_stats <- describe(df)
# Select and format key columns: mean, SD, min, max, median
desc_table <- desc_stats %>%
  select(vars, n, mean, sd, min, max, median) %>%
  rename(
    Variable = vars,
    Count = n,
    Mean = mean,
    SD = sd,
    Min = min,
    Max = max,
    Median = median
  )
# Print as a nice table
kable(desc_table, caption = "Table: Descriptive Statistics of White Wine Dataset")
|                     | Variable| Count|        Mean|         SD|     Min|       Max|    Median|
|:--------------------|--------:|-----:|-----------:|----------:|-------:|---------:|---------:|
|fixed.acidity        |        1|  4898|   6.8547877|  0.8438682| 3.80000|  14.20000|   6.80000|
|volatile.acidity     |        2|  4898|   0.2782411|  0.1007945| 0.08000|   1.10000|   0.26000|
|citric.acid          |        3|  4898|   0.3341915|  0.1210198| 0.00000|   1.66000|   0.32000|
|residual.sugar       |        4|  4898|   6.3914149|  5.0720578| 0.60000|  65.80000|   5.20000|
|chlorides            |        5|  4898|   0.0457724|  0.0218480| 0.00900|   0.34600|   0.04300|
|free.sulfur.dioxide  |        6|  4898|  35.3080849| 17.0071373| 2.00000| 289.00000|  34.00000|
|total.sulfur.dioxide |        7|  4898| 138.3606574| 42.4980646| 9.00000| 440.00000| 134.00000|
|density              |        8|  4898|   0.9940274|  0.0029909| 0.98711|   1.03898|   0.99374|
|pH                   |        9|  4898|   3.1882666|  0.1510006| 2.72000|   3.82000|   3.18000|
|sulphates            |       10|  4898|   0.4898469|  0.1141258| 0.22000|   1.08000|   0.47000|
|alcohol              |       11|  4898|  10.5142670|  1.2306206| 8.00000|  14.20000|  10.40000|
|quality              |       12|  4898|   5.8779094|  0.8856386| 3.00000|   9.00000|   6.00000|
# Plot quality distribution (stored in an object so it can be saved later)
p_quality <- ggplot(df, aes(x = factor(quality))) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribution of Wine Quality Scores",
       x = "Wine Quality (Score)",
       y = "Count") +
  theme_minimal()
p_quality
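# Optional companion to the bar chart: exact counts and shares per score.
# Quality runs from 3 to 9 in this dataset, with most wines scored 5 or 6
# (consistent with the mean of 5.88 and median of 6 in the table above).
table(df$quality)
round(prop.table(table(df$quality)), 3)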
### Correlation Heatmap of Wine Features
# Calculate correlation matrix
corr_matrix <- cor(df)
# Plot correlation heatmap
corrplot(corr_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45,
         mar = c(0, 0, 2, 0),   # extra top margin; corrplot otherwise clips the title
         title = "Correlation Heatmap of Wine Features")
### Alcohol Content by Wine Quality
# Alcohol vs Quality
ggplot(df, aes(x = factor(quality), y = alcohol)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Alcohol Content by Wine Quality",
       x = "Wine Quality (Score)",
       y = "Alcohol (%)") +
  theme_minimal()
### Volatile Acidity by Wine Quality
ggplot(df, aes(x = factor(quality), y = volatile.acidity)) +
  geom_boxplot(fill = "tomato") +
  labs(title = "Volatile Acidity by Wine Quality",
       x = "Wine Quality (Score)",
       y = "Volatile Acidity (g/dm³)") +
  theme_minimal()
ggsave("quality_distribution.png")
## Saving 7 x 5 in image
ggsave("correlation_heatmap.png")
## Saving 7 x 5 in image
# Train-test split (70-30)
set.seed(123)
trainIndex <- createDataPartition(df$quality, p = 0.7, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]
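# Resulting split sizes: 3429 train / 1469 test. (The training count can be
# cross-checked against the lm fit below: 3417 residual degrees of freedom
# plus 12 estimated coefficients = 3429 rows.)
nrow(train)
nrow(test)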
# Fit Multiple Linear Regression
mlr_model <- lm(quality ~ ., data = train)
# Model summary
summary(mlr_model)
##
## Call:
## lm(formula = quality ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4802 -0.4905 -0.0488 0.4641 3.1275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.298e+02 2.086e+01 6.223 5.46e-10 ***
## fixed.acidity 5.471e-02 2.379e-02 2.300 0.0215 *
## volatile.acidity -1.764e+00 1.328e-01 -13.281 < 2e-16 ***
## citric.acid 7.488e-02 1.136e-01 0.659 0.5098
## residual.sugar 7.074e-02 8.575e-03 8.249 2.24e-16 ***
## chlorides -5.646e-01 6.428e-01 -0.878 0.3799
## free.sulfur.dioxide 5.183e-03 1.038e-03 4.995 6.17e-07 ***
## total.sulfur.dioxide -5.857e-04 4.477e-04 -1.308 0.1909
## density -1.300e+02 2.116e+01 -6.142 9.11e-10 ***
## pH 7.197e-01 1.224e-01 5.880 4.49e-09 ***
## sulphates 6.718e-01 1.178e-01 5.705 1.26e-08 ***
## alcohol 2.084e-01 2.694e-02 7.736 1.34e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7488 on 3417 degrees of freedom
## Multiple R-squared: 0.2855, Adjusted R-squared: 0.2832
## F-statistic: 124.1 on 11 and 3417 DF, p-value: < 2.2e-16
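# Before leaning on the linear fit, the standard lm diagnostics are worth a
# look (optional; plot.lm gives residuals-vs-fitted, Q-Q, scale-location,
# and leverage panels; with an integer 3-9 response, banded residuals are
# expected):
par(mfrow = c(2, 2))
plot(mlr_model)
par(mfrow = c(1, 1))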
# Predictions
mlr_pred <- predict(mlr_model, newdata = test)
# Evaluate performance
mlr_rmse <- RMSE(mlr_pred, test$quality)
mlr_r2 <- R2(mlr_pred, test$quality)
mlr_mae <- MAE(mlr_pred, test$quality)
cat("MLR RMSE:", mlr_rmse, "\n")
## MLR RMSE: 0.7594318
cat("MLR R-squared:", mlr_r2, "\n")
## MLR R-squared: 0.2695063
cat("MLR MAE:", mlr_mae, "\n")
## MLR MAE: 0.5896617
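# For reference, the caret metrics above are equivalent to these manual
# computations (caret's R2() defaults to the squared Pearson correlation
# between predictions and observations):
sqrt(mean((mlr_pred - test$quality)^2))   # RMSE
mean(abs(mlr_pred - test$quality))        # MAE
cor(mlr_pred, test$quality)^2             # R-squared (correlation form)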
library(randomForest)
# Fit Random Forest with 500 trees
set.seed(123)
rf_model <- randomForest(quality ~ ., data = train, ntree = 500, importance = TRUE)
# Model summary
print(rf_model)
##
## Call:
## randomForest(formula = quality ~ ., data = train, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.3932654
## % Var explained: 49.71
# Feature importance plot
varImpPlot(rf_model)
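# Numeric counterpart to the importance plot: for a regression forest fit
# with importance = TRUE, importance() returns %IncMSE (permutation
# importance) and IncNodePurity per feature.
imp <- importance(rf_model)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]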
# Predictions
rf_pred <- predict(rf_model, newdata = test)
# Evaluate performance
rf_rmse <- RMSE(rf_pred, test$quality)
rf_r2 <- R2(rf_pred, test$quality)
rf_mae <- MAE(rf_pred, test$quality)
cat("RF RMSE:", rf_rmse, "\n")
## RF RMSE: 0.6089339
cat("RF R-squared:", rf_r2, "\n")
## RF R-squared: 0.5401387
cat("RF MAE:", rf_mae, "\n")
## RF MAE: 0.4447028
library(xgboost)
library(Matrix)   # optional here: only needed if sparse model matrices are used
# Prepare data (convert to matrix format)
train_matrix <- as.matrix(train[, -which(names(train) == "quality")])
train_label <- train$quality
test_matrix <- as.matrix(test[, -which(names(test) == "quality")])
test_label <- test$quality
# Convert to DMatrix
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
dtest <- xgb.DMatrix(data = test_matrix, label = test_label)
# Set parameters
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse",
  max_depth = 6,
  eta = 0.1,
  subsample = 0.8
)
# Train XGBoost
set.seed(123)
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain, eval = dtest),
  early_stopping_rounds = 10,
  verbose = 0
)
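# With early stopping, xgboost records the best round on the evaluation set
# (stored on the booster as best_iteration / best_score). Note that stopping
# on dtest lets the test data influence training; a separate validation
# split would be cleaner if strict test-set hygiene is required.
xgb_model$best_iteration
xgb_model$best_score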
# Predictions
xgb_pred <- predict(xgb_model, newdata = dtest)
# Evaluate performance
xgb_rmse <- RMSE(xgb_pred, test_label)
xgb_r2 <- R2(xgb_pred, test_label)
xgb_mae <- MAE(xgb_pred, test_label)
cat("XGBoost RMSE:", xgb_rmse, "\n")
## XGBoost RMSE: 0.6355915
cat("XGBoost R-squared:", xgb_r2, "\n")
## XGBoost R-squared: 0.4886706
cat("XGBoost MAE:", xgb_mae, "\n")
## XGBoost MAE: 0.4811395
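# Feature importance for the boosted model: xgb.importance() reports Gain,
# Cover, and Frequency per feature, and xgb.plot.importance() charts it.
importance_matrix <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix)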
# Collect the computed metrics, using the stored values rather than
# re-typing rounded numbers (less error-prone if the models are re-run)
df_perf <- data.frame(
  Model = c("Multiple Linear Regression", "XGBoost", "Random Forest"),
  RMSE = c(mlr_rmse, xgb_rmse, rf_rmse),
  R_squared = c(mlr_r2, xgb_r2, rf_r2),
  MAE = c(mlr_mae, xgb_mae, rf_mae)
)
# Reshape to long format
df_long <- df_perf %>%
  pivot_longer(cols = c(RMSE, R_squared, MAE), names_to = "Metric", values_to = "Value")
# Plot with color by Model, removing x-axis labels
ggplot(df_long, aes(x = Model, y = Value, fill = Model)) +
  geom_col(position = "dodge") +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(title = "Model Performance Comparison",
       x = "Model", y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
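# The same comparison as a table (rounded for readability); random forest
# leads on all three metrics here:
kable(df_perf, digits = 3, caption = "Table: Test-Set Performance by Model")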