# Read the red and white wine tables from their CSV exports.
# NOTE(review): both paths are absolute and machine-specific; the script only
# runs where these exact files exist.
redwine <- read.csv("C:\\Users\\brian\\OneDrive\\Documents\\Stat Learning\\red wine.csv")
whitewine <- read.csv("C:\\Users\\brian\\OneDrive\\Documents\\Stat Learning\\white wine.csv")
# Tag every row with its source table so the type survives the merge below.
redwine[["wine_type"]] <- "red"
whitewine[["wine_type"]] <- "white"
# Count the missing cells in each table.
sum(is.na(redwine))
## [1] 0
sum(is.na(whitewine))
## [1] 0
# Stack the two tables into one data frame.
wine_data <- rbind(redwine, whitewine)
# Replace the type label with the factor's numeric code. Factor levels sort
# alphabetically, so "red" becomes 1 and "white" becomes 2.
wine_data[["wine_type"]] <- as.numeric(factor(wine_data[["wine_type"]]))
head(wine_data)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality wine_type
## 1 5 1
## 2 5 1
## 3 5 1
## 4 6 1
## 5 5 1
## 6 5 1
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Show the structure of the combined data frame: column names, storage types
# and the first few values of each column.
str(wine_data)
## 'data.frame': 6497 obs. of 13 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
## $ wine_type : num 1 1 1 1 1 1 1 1 1 1 ...
# Print per-column summary statistics for the combined data frame.
summary(wine_data)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9970
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :9.000
## wine_type
## Min. :1.000
## 1st Qu.:2.000
## Median :2.000
## Mean :1.754
## 3rd Qu.:2.000
## Max. :2.000
# Row counts per encoded wine type (1 = red, 2 = white).
table(wine_data[["wine_type"]])
##
## 1 2
## 1599 4898
# Row counts per quality score.
table(wine_data[["quality"]])
##
## 3 4 5 6 7 8 9
## 30 216 2138 2836 1079 193 5
# Turn the numeric codes back into a labelled factor so that plots and model
# summaries show "red"/"white" instead of 1/2.
wine_data[["wine_type"]] <- factor(
  wine_data[["wine_type"]],
  levels = c(1, 2),
  labels = c("red", "white")
)
# Dodged bar chart: number of wines at each quality score, one bar per type.
ggplot(data = wine_data, mapping = aes(x = factor(quality), fill = wine_type)) +
  geom_bar(position = "dodge") +
  ggtitle("Wine Quality Distribution by Type") +
  xlab("Quality Score") +
  ylab("Count") +
  scale_fill_manual(values = c(red = "firebrick", white = "goldenrod"))

# Box plots: alcohol content at each quality score, split by wine type.
ggplot(
  data = wine_data,
  mapping = aes(x = factor(quality), y = alcohol, fill = wine_type)
) +
  geom_boxplot() +
  ggtitle("Alcohol Content by Wine Quality") +
  xlab("Quality") +
  ylab("Alcohol") +
  scale_fill_manual(values = c(red = "firebrick", white = "goldenrod"))

# Linear regression ----
# Fits ordinary least squares of quality on every other column, using an
# 80/20 train/test split, and reports the hold-out RMSE.
set.seed(123)
n_obs <- nrow(wine_data)
# sample.int() avoids the 1:n pitfall on an empty frame, and floor() makes the
# truncation of the fractional size explicit. The rows drawn are the same as
# with sample(1:n, 0.8 * n).
train_index <- sample.int(n_obs, floor(0.8 * n_obs))
train_data <- wine_data[train_index, ]
test_data <- wine_data[-train_index, ]
# Fit the model on the training rows only.
lm_model <- lm(quality ~ ., data = train_data)
# Predict quality for the held-out rows.
lm_pred <- predict(lm_model, newdata = test_data)
# Root-mean-squared error on the test set. It is printed so the metric
# actually appears in the report instead of being computed and discarded.
lm_rmse <- sqrt(mean((lm_pred - test_data$quality)^2))
cat("Linear regression test RMSE:", round(lm_rmse, 4), "\n")
summary(lm_model)
##
## Call:
## lm(formula = quality ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5633 -0.4718 -0.0419 0.4588 2.7600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.070e+02 1.531e+01 6.991 3.08e-12 ***
## fixed.acidity 8.472e-02 1.731e-02 4.895 1.01e-06 ***
## volatile.acidity -1.538e+00 9.068e-02 -16.962 < 2e-16 ***
## citric.acid -5.972e-02 8.932e-02 -0.669 0.5038
## residual.sugar 6.467e-02 6.500e-03 9.949 < 2e-16 ***
## chlorides -7.691e-01 3.669e-01 -2.096 0.0361 *
## free.sulfur.dioxide 5.373e-03 8.801e-04 6.104 1.11e-09 ***
## total.sulfur.dioxide -1.460e-03 3.621e-04 -4.032 5.61e-05 ***
## density -1.062e+02 1.553e+01 -6.839 8.92e-12 ***
## pH 5.037e-01 1.001e-01 5.033 5.00e-07 ***
## sulphates 7.110e-01 8.512e-02 8.354 < 2e-16 ***
## alcohol 2.247e-01 1.967e-02 11.425 < 2e-16 ***
## wine_typewhite -4.035e-01 6.279e-02 -6.426 1.43e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7336 on 5184 degrees of freedom
## Multiple R-squared: 0.3021, Adjusted R-squared: 0.3005
## F-statistic: 187 on 12 and 5184 DF, p-value: < 2.2e-16
#Random Forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit a 500-tree regression forest of quality on all other columns.
# importance = TRUE also stores the permutation-based measure (%IncMSE).
# No seed is set here: the fit depends on the RNG state left by earlier code.
rf_model <- randomForest(quality ~ ., data = train_data, ntree = 500, importance = TRUE)
# Predict quality for the held-out rows.
rf_pred <- predict(rf_model, newdata = test_data)
# Root-mean-squared error on the test set. It is printed so the metric
# actually appears in the report instead of being computed and discarded.
rf_rmse <- sqrt(mean((rf_pred - test_data$quality)^2))
cat("Random forest test RMSE:", round(rf_rmse, 4), "\n")
# Per-feature importance table, followed by the matching dot chart.
importance(rf_model)
## %IncMSE IncNodePurity
## fixed.acidity 50.77460 225.13359
## volatile.acidity 81.59938 445.63538
## citric.acid 52.45018 274.29379
## residual.sugar 61.54419 281.68639
## chlorides 47.61695 297.51923
## free.sulfur.dioxide 77.14629 330.68267
## total.sulfur.dioxide 52.83166 284.70721
## density 42.08119 382.53707
## pH 63.05055 254.71268
## sulphates 65.31117 277.63214
## alcohol 87.54968 758.43166
## wine_type 11.59997 11.39027
varImpPlot(rf_model)

## Binary SVM classification (high vs. low quality)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
# Binary target: 1 for wines scoring 7 or higher, 0 otherwise. It is stored as
# a factor so that svm() fits a classifier.
wine_data[["class_quality"]] <- factor(as.integer(wine_data[["quality"]] >= 7))
# Re-encode wine_type as the numeric code of its factor level.
wine_data[["wine_type"]] <- as.numeric(as.factor(wine_data[["wine_type"]]))
# 70/30 train/test split with a fixed seed.
set.seed(123)
sample_index <- sample.int(nrow(wine_data), 0.7 * nrow(wine_data))
train_data <- wine_data[sample_index, ]
test_data <- wine_data[-sample_index, ]
# Linear-kernel SVM on every column except quality, which defines the target.
svmfit <- svm(
  class_quality ~ . - quality,
  data = train_data,
  kernel = "linear",
  cost = 10,
  scale = TRUE
)
# Class predictions for the held-out rows.
pred_svm <- predict(svmfit, newdata = test_data)
# Confusion matrix of predicted vs. actual class.
# NOTE(review): in the recorded output below every test row is predicted as
# class 0, so the model never identifies a high-quality wine.
table(Predicted = pred_svm, Actual = test_data$class_quality)
## Actual
## Predicted 0 1
## 0 1569 381
## 1 0 0
## Multi-class SVM classification (linear kernel)
# Treat every quality score as its own class.
wine_data[["quality"]] <- as.factor(wine_data[["quality"]])
# 70/30 train/test split with a fixed seed.
set.seed(123)
sample_index <- sample.int(nrow(wine_data), 0.7 * nrow(wine_data))
train_data <- wine_data[sample_index, ]
test_data <- wine_data[-sample_index, ]
# Linear-kernel SVM. class_quality is left out because it is computed from
# quality itself.
svmfit <- svm(
  quality ~ . - class_quality,
  data = train_data,
  kernel = "linear",
  cost = 10,
  scale = TRUE
)
# Class predictions for the held-out rows.
pred_quality <- predict(svmfit, newdata = test_data)
# Confusion matrix of predicted vs. actual score.
# NOTE(review): in the recorded output below only scores 5 and 6 are ever
# predicted.
table(Predicted = pred_quality, Actual = test_data$quality)
## Actual
## Predicted 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 1 35 376 219 28 3 0
## 6 6 19 243 670 293 54 3
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
# Support Vector Machine (SVM) with radial kernel
library(e1071)
# Multi-class quality classifier using a radial-kernel SVM, evaluated on a
# 30% hold-out set.
# 1. Store wine_type as a factor (it may hold numeric codes at this point).
wine_data$wine_type <- as.factor(wine_data$wine_type)
# 2. Make sure quality is a factor so svm() performs classification.
wine_data$quality <- as.factor(wine_data$quality)
# 3. Train/test split. sample.int() with an explicit floor() draws the same
#    rows as sample(1:n, 0.7 * n) without relying on silent truncation.
set.seed(123)
train_idx <- sample.int(nrow(wine_data), floor(0.7 * nrow(wine_data)))
train_data <- wine_data[train_idx, ]
test_data <- wine_data[-train_idx, ]
# 4. Train the radial SVM.
#    class_quality is excluded because it is derived from quality
#    (quality >= 7). Keeping it as a predictor leaks the target into the
#    features and inflates the measured accuracy.
svm_rbf <- svm(quality ~ . - class_quality,
               data = train_data,
               kernel = "radial", # non-linear kernel
               cost = 1,          # penalty for misclassification
               gamma = 0.1,       # reach of a single training example
               scale = TRUE)
# 5. Predict on the held-out rows.
pred_rbf <- predict(svm_rbf, newdata = test_data)
# 6. Confusion matrix of predicted vs. actual score.
conf_mat <- table(Predicted = pred_rbf, Actual = test_data$quality)
print(conf_mat)
# NOTE: the matrix below was recorded from the earlier fit that still used
# class_quality as a predictor (scores 7-9 are separated almost perfectly from
# 6 and below). It is stale; re-run the script to regenerate it.
## Actual
## Predicted 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 1 39 420 190 2 0 0
## 6 6 15 199 699 0 0 0
## 7 0 0 0 0 319 57 3
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
# Overall accuracy: share of test rows on the diagonal of the matrix.
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
cat("Accuracy:", round(accuracy, 4), "\n")
# NOTE: the figure below also comes from the leaky fit and is stale.
## Accuracy: 0.7374