library(tidyverse)   # loads ggplot2, dplyr, readr, tidyr, purrr, ...
library(psych)       # describe() for descriptive statistics
library(knitr)       # kable() for formatted tables
library(corrplot)    # correlation heatmaps
library(caret)       # train/test partitioning and regression metrics
# Read the dataset (white wine)
df <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ";")
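# Optional sanity check on the import: the UCI file is semicolon-separated,
# so a clean read should yield 4898 rows and 12 columns with no missing
# values, matching n = 4898 for every variable in the table below.
dim(df)          # expected: 4898 12
sum(is.na(df))   # expected: 0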
# Get descriptive statistics using psych::describe
desc_stats <- describe(df)
# Select and format key columns: mean, SD, min, max, median
desc_table <- desc_stats %>%
  select(vars, n, mean, sd, min, max, median) %>%
  rename(
    Variable = vars,
    Count = n,
    Mean = mean,
    SD = sd,
    Min = min,
    Max = max,
    Median = median
  )
# Print as a nice table
kable(desc_table, caption = "Table: Descriptive Statistics of White Wine Dataset")
|                     | Variable| Count|        Mean|         SD|     Min|       Max|    Median|
|:--------------------|--------:|-----:|-----------:|----------:|-------:|---------:|---------:|
|fixed.acidity        |        1|  4898|   6.8547877|  0.8438682| 3.80000|  14.20000|   6.80000|
|volatile.acidity     |        2|  4898|   0.2782411|  0.1007945| 0.08000|   1.10000|   0.26000|
|citric.acid          |        3|  4898|   0.3341915|  0.1210198| 0.00000|   1.66000|   0.32000|
|residual.sugar       |        4|  4898|   6.3914149|  5.0720578| 0.60000|  65.80000|   5.20000|
|chlorides            |        5|  4898|   0.0457724|  0.0218480| 0.00900|   0.34600|   0.04300|
|free.sulfur.dioxide  |        6|  4898|  35.3080849| 17.0071373| 2.00000| 289.00000|  34.00000|
|total.sulfur.dioxide |        7|  4898| 138.3606574| 42.4980646| 9.00000| 440.00000| 134.00000|
|density              |        8|  4898|   0.9940274|  0.0029909| 0.98711|   1.03898|   0.99374|
|pH                   |        9|  4898|   3.1882666|  0.1510006| 2.72000|   3.82000|   3.18000|
|sulphates            |       10|  4898|   0.4898469|  0.1141258| 0.22000|   1.08000|   0.47000|
|alcohol              |       11|  4898|  10.5142670|  1.2306206| 8.00000|  14.20000|  10.40000|
|quality              |       12|  4898|   5.8779094|  0.8856386| 3.00000|   9.00000|   6.00000|
# Plot quality distribution (stored in an object so it can be saved later)
p_quality <- ggplot(df, aes(x = factor(quality))) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribution of Wine Quality Scores",
       x = "Wine Quality (Score)",
       y = "Count") +
  theme_minimal()
p_quality
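# Optional companion to the bar chart: exact counts and shares per score.
# Quality runs from 3 to 9 in this dataset, with most wines scored 5 or 6
# (consistent with the mean of 5.88 and median of 6 in the table above).
table(df$quality)
round(prop.table(table(df$quality)), 3)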
### Correlation Heatmap of Wine Features
# Calculate correlation matrix
corr_matrix <- cor(df)
# Plot correlation heatmap
corrplot(corr_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45,
         mar = c(0, 0, 2, 0),   # extra top margin; corrplot otherwise clips the title
         title = "Correlation Heatmap of Wine Features")
### Alcohol Content by Wine Quality
# Alcohol vs Quality
ggplot(df, aes(x = factor(quality), y = alcohol)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Alcohol Content by Wine Quality",
       x = "Wine Quality (Score)",
       y = "Alcohol (%)") +
  theme_minimal()
### Volatile Acidity by Wine Quality
ggplot(df, aes(x = factor(quality), y = volatile.acidity)) +
  geom_boxplot(fill = "tomato") +
  labs(title = "Volatile Acidity by Wine Quality",
       x = "Wine Quality (Score)",
       y = "Volatile Acidity (g/dm³)") +
  theme_minimal()
ggsave("quality_distribution.png")
## Saving 7 x 5 in image
ggsave("correlation_heatmap.png")
## Saving 7 x 5 in image
# Train-test split (70-30)
set.seed(123)
trainIndex <- createDataPartition(df$quality, p = 0.7, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]
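# Resulting split sizes: 3429 train / 1469 test. (The training count can be
# cross-checked against the lm fit below: 3417 residual degrees of freedom
# plus 12 estimated coefficients = 3429 rows.)
nrow(train)
nrow(test)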
# Fit Multiple Linear Regression
mlr_model <- lm(quality ~ ., data = train)
# Model summary
summary(mlr_model)
##
## Call:
## lm(formula = quality ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4802 -0.4905 -0.0488 0.4641 3.1275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.298e+02 2.086e+01 6.223 5.46e-10 ***
## fixed.acidity 5.471e-02 2.379e-02 2.300 0.0215 *
## volatile.acidity -1.764e+00 1.328e-01 -13.281 < 2e-16 ***
## citric.acid 7.488e-02 1.136e-01 0.659 0.5098
## residual.sugar 7.074e-02 8.575e-03 8.249 2.24e-16 ***
## chlorides -5.646e-01 6.428e-01 -0.878 0.3799
## free.sulfur.dioxide 5.183e-03 1.038e-03 4.995 6.17e-07 ***
## total.sulfur.dioxide -5.857e-04 4.477e-04 -1.308 0.1909
## density -1.300e+02 2.116e+01 -6.142 9.11e-10 ***
## pH 7.197e-01 1.224e-01 5.880 4.49e-09 ***
## sulphates 6.718e-01 1.178e-01 5.705 1.26e-08 ***
## alcohol 2.084e-01 2.694e-02 7.736 1.34e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7488 on 3417 degrees of freedom
## Multiple R-squared: 0.2855, Adjusted R-squared: 0.2832
## F-statistic: 124.1 on 11 and 3417 DF, p-value: < 2.2e-16
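# Before leaning on the linear fit, the standard lm diagnostics are worth a
# look (optional; plot.lm gives residuals-vs-fitted, Q-Q, scale-location,
# and leverage panels; with an integer 3-9 response, banded residuals are
# expected):
par(mfrow = c(2, 2))
plot(mlr_model)
par(mfrow = c(1, 1))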
# Predictions
mlr_pred <- predict(mlr_model, newdata = test)
# Evaluate performance
mlr_rmse <- RMSE(mlr_pred, test$quality)
mlr_r2 <- R2(mlr_pred, test$quality)
mlr_mae <- MAE(mlr_pred, test$quality)
cat("MLR RMSE:", mlr_rmse, "\n")
## MLR RMSE: 0.7594318
cat("MLR R-squared:", mlr_r2, "\n")
## MLR R-squared: 0.2695063
cat("MLR MAE:", mlr_mae, "\n")
## MLR MAE: 0.5896617
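# For reference, the caret metrics above are equivalent to these manual
# computations (caret's R2() defaults to the squared Pearson correlation
# between predictions and observations):
sqrt(mean((mlr_pred - test$quality)^2))   # RMSE
mean(abs(mlr_pred - test$quality))        # MAE
cor(mlr_pred, test$quality)^2             # R-squared (correlation form)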
library(randomForest)
# Fit Random Forest with 500 trees
set.seed(123)
rf_model <- randomForest(quality ~ ., data = train, ntree = 500, importance = TRUE)
# Model summary
print(rf_model)
##
## Call:
## randomForest(formula = quality ~ ., data = train, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.3932654
## % Var explained: 49.71
# Feature importance plot
varImpPlot(rf_model)
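# Numeric counterpart to the importance plot: for a regression forest fit
# with importance = TRUE, importance() returns %IncMSE (permutation
# importance) and IncNodePurity per feature.
imp <- importance(rf_model)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]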
# Predictions
rf_pred <- predict(rf_model, newdata = test)
# Evaluate performance
rf_rmse <- RMSE(rf_pred, test$quality)
rf_r2 <- R2(rf_pred, test$quality)
rf_mae <- MAE(rf_pred, test$quality)
cat("RF RMSE:", rf_rmse, "\n")
## RF RMSE: 0.6089339
cat("RF R-squared:", rf_r2, "\n")
## RF R-squared: 0.5401387
cat("RF MAE:", rf_mae, "\n")
## RF MAE: 0.4447028
library(xgboost)
library(Matrix)   # optional here: only needed if sparse model matrices are used
# Prepare data (convert to matrix format)
train_matrix <- as.matrix(train[, -which(names(train) == "quality")])
train_label <- train$quality
test_matrix <- as.matrix(test[, -which(names(test) == "quality")])
test_label <- test$quality
# Convert to DMatrix
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
dtest <- xgb.DMatrix(data = test_matrix, label = test_label)
# Set parameters
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse",
  max_depth = 6,
  eta = 0.1,
  subsample = 0.8
)
# Train XGBoost
set.seed(123)
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain, eval = dtest),
  early_stopping_rounds = 10,
  verbose = 0
)
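# With early stopping, xgboost records the best round on the evaluation set
# (stored on the booster as best_iteration / best_score). Note that stopping
# on dtest lets the test data influence training; a separate validation
# split would be cleaner if strict test-set hygiene is required.
xgb_model$best_iteration
xgb_model$best_score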
# Predictions
xgb_pred <- predict(xgb_model, newdata = dtest)
# Evaluate performance
xgb_rmse <- RMSE(xgb_pred, test_label)
xgb_r2 <- R2(xgb_pred, test_label)
xgb_mae <- MAE(xgb_pred, test_label)
cat("XGBoost RMSE:", xgb_rmse, "\n")
## XGBoost RMSE: 0.6355915
cat("XGBoost R-squared:", xgb_r2, "\n")
## XGBoost R-squared: 0.4886706
cat("XGBoost MAE:", xgb_mae, "\n")
## XGBoost MAE: 0.4811395
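# Feature importance for the boosted model: xgb.importance() reports Gain,
# Cover, and Frequency per feature, and xgb.plot.importance() charts it.
importance_matrix <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix)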
# Collect the computed metrics, using the stored values rather than
# re-typing rounded numbers (less error-prone if the models are re-run)
df_perf <- data.frame(
  Model = c("Multiple Linear Regression", "XGBoost", "Random Forest"),
  RMSE = c(mlr_rmse, xgb_rmse, rf_rmse),
  R_squared = c(mlr_r2, xgb_r2, rf_r2),
  MAE = c(mlr_mae, xgb_mae, rf_mae)
)
# Reshape to long format
df_long <- df_perf %>%
  pivot_longer(cols = c(RMSE, R_squared, MAE), names_to = "Metric", values_to = "Value")
# Plot with color by Model, removing x-axis labels
ggplot(df_long, aes(x = Model, y = Value, fill = Model)) +
  geom_col(position = "dodge") +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(title = "Model Performance Comparison",
       x = "Model", y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
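# The same comparison as a table (rounded for readability); random forest
# leads on all three metrics here:
kable(df_perf, digits = 3, caption = "Table: Test-Set Performance by Model")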