Stats Final Project

# Load the red and white wine tables from their CSV files.
redwine <- read.csv(
  "C:\\Users\\brian\\OneDrive\\Documents\\Stat Learning\\red wine.csv"
)
whitewine <- read.csv(
  "C:\\Users\\brian\\OneDrive\\Documents\\Stat Learning\\white wine.csv"
)

# Tag every row with its source table so the type survives the merge below.
redwine$wine_type <- "red"
whitewine$wine_type <- "white"

# Total number of missing cells in the red table.
sum(is.na(redwine))

## [1] 0

# Total number of missing cells in the white table.
sum(is.na(whitewine))

## [1] 0

# Stack the two tables row-wise: red rows first, then white rows.
wine_data <- rbind(redwine, whitewine)

# Encode wine_type as a number using the factor's alphabetical level order,
# so "red" becomes 1 and "white" becomes 2.
wine_data$wine_type <- as.numeric(as.factor(wine_data$wine_type))

# Preview the first rows of the combined table.
head(wine_data)

##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality wine_type
## 1       5         1
## 2       5         1
## 3       5         1
## 4       6         1
## 5       5         1
## 6       5         1

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.95 loaded

library(GGally)

## Warning: package 'GGally' was built under R version 4.3.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Show the structure of the combined table: each column's type and its
# first few values.
str(wine_data)

## 'data.frame':    6497 obs. of  13 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
##  $ wine_type           : num  1 1 1 1 1 1 1 1 1 1 ...

# Per-column summary statistics for the combined table.
summary(wine_data)

##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :0.3397   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :0.9949  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :0.9947  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:0.9970  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :10.30   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :10.49   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :9.000  
##    wine_type    
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :2.000  
##  Mean   :1.754  
##  3rd Qu.:2.000  
##  Max.   :2.000

# Row counts per wine type code (wine_type was numerically encoded earlier:
# 1 = red, 2 = white).
table(wine_data$wine_type)

## 
##    1    2 
## 1599 4898

# Row counts per quality score.
table(wine_data$quality)

## 
##    3    4    5    6    7    8    9 
##   30  216 2138 2836 1079  193    5

# Turn the numeric type codes back into a labelled factor for plotting
# (code 1 -> "red", code 2 -> "white").
wine_data$wine_type <- factor(
  wine_data$wine_type,
  levels = c(1, 2),
  labels = c("red", "white")
)

# Fill colours used by both plots, keyed by wine type.
type_colours <- c("red" = "firebrick", "white" = "goldenrod")

# Dodged bar chart: number of wines at each quality score, split by type.
ggplot(wine_data, aes(x = factor(quality), fill = wine_type)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Wine Quality Distribution by Type",
    x = "Quality Score",
    y = "Count"
  ) +
  scale_fill_manual(values = type_colours)

# Box plots: alcohol content at each quality score, split by type.
ggplot(wine_data, aes(x = factor(quality), y = alcohol, fill = wine_type)) +
  geom_boxplot() +
  labs(
    title = "Alcohol Content by Wine Quality",
    x = "Quality",
    y = "Alcohol"
  ) +
  scale_fill_manual(values = type_colours)

#Linear Regression
# Reproducible 80/20 train/test split of the combined data.
set.seed(123)
n_obs <- nrow(wine_data)
train_index <- sample(seq_len(n_obs), 0.8 * n_obs)
train_data <- wine_data[train_index, ]
test_data <- wine_data[-train_index, ]

# Regress quality on every other column, using the training rows only.
lm_model <- lm(quality ~ ., data = train_data)

# Predicted quality for the held-out rows.
lm_pred <- predict(lm_model, newdata = test_data)

# Root-mean-squared error on the test set. The value is stored in lm_rmse
# but not printed here.
lm_error <- lm_pred - test_data$quality
lm_rmse <- sqrt(mean(lm_error^2))

# Coefficient table and fit statistics for the training fit.
summary(lm_model)

## 
## Call:
## lm(formula = quality ~ ., data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5633 -0.4718 -0.0419  0.4588  2.7600 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.070e+02  1.531e+01   6.991 3.08e-12 ***
## fixed.acidity         8.472e-02  1.731e-02   4.895 1.01e-06 ***
## volatile.acidity     -1.538e+00  9.068e-02 -16.962  < 2e-16 ***
## citric.acid          -5.972e-02  8.932e-02  -0.669   0.5038    
## residual.sugar        6.467e-02  6.500e-03   9.949  < 2e-16 ***
## chlorides            -7.691e-01  3.669e-01  -2.096   0.0361 *  
## free.sulfur.dioxide   5.373e-03  8.801e-04   6.104 1.11e-09 ***
## total.sulfur.dioxide -1.460e-03  3.621e-04  -4.032 5.61e-05 ***
## density              -1.062e+02  1.553e+01  -6.839 8.92e-12 ***
## pH                    5.037e-01  1.001e-01   5.033 5.00e-07 ***
## sulphates             7.110e-01  8.512e-02   8.354  < 2e-16 ***
## alcohol               2.247e-01  1.967e-02  11.425  < 2e-16 ***
## wine_typewhite       -4.035e-01  6.279e-02  -6.426 1.43e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7336 on 5184 degrees of freedom
## Multiple R-squared:  0.3021, Adjusted R-squared:  0.3005 
## F-statistic:   187 on 12 and 5184 DF,  p-value: < 2.2e-16

#Random Forest
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.3.3

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Train model
# Fit a 500-tree random forest for quality on all other columns.
# importance = TRUE makes the fit record the importance measures that are
# printed and plotted afterwards.
rf_model <- randomForest(
  quality ~ .,
  data = train_data,
  ntree = 500,
  importance = TRUE
)

# Predicted quality for the held-out rows.
rf_pred <- predict(rf_model, newdata = test_data)

# Root-mean-squared error on the test set. The value is stored in rf_rmse
# but not printed here.
rf_sq_error <- (rf_pred - test_data$quality)^2
rf_rmse <- sqrt(mean(rf_sq_error))

# Importance table: one row per predictor.
importance(rf_model)

##                       %IncMSE IncNodePurity
## fixed.acidity        50.77460     225.13359
## volatile.acidity     81.59938     445.63538
## citric.acid          52.45018     274.29379
## residual.sugar       61.54419     281.68639
## chlorides            47.61695     297.51923
## free.sulfur.dioxide  77.14629     330.68267
## total.sulfur.dioxide 52.83166     284.70721
## density              42.08119     382.53707
## pH                   63.05055     254.71268
## sulphates            65.31117     277.63214
## alcohol              87.54968     758.43166
## wine_type            11.59997      11.39027

# Plot the variable-importance measures of the fitted random forest.
varImpPlot(rf_model)

## Binary SVM classification (high vs. low quality)
library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

# 1. Create a binary target variable: high quality = 1, low = 0
# 1. Binary target stored as a factor: 1 when quality is 7 or higher,
#    otherwise 0.
wine_data$class_quality <- as.factor(ifelse(wine_data$quality >= 7, 1, 0))

# 2. Re-encode wine_type as numeric level codes.
wine_data$wine_type <- as.numeric(as.factor(wine_data$wine_type))

# 3. Reproducible 70/30 train/test split.
set.seed(123)
n_rows <- nrow(wine_data)
sample_index <- sample(seq_len(n_rows), 0.7 * n_rows)
train_data <- wine_data[sample_index, ]
test_data <- wine_data[-sample_index, ]

# 4. Linear-kernel SVM on the binary target. The raw quality score is
#    removed from the predictors because class_quality is computed from it.
svmfit <- svm(
  class_quality ~ . - quality,
  data = train_data,
  kernel = "linear",
  cost = 10,
  scale = TRUE
)

# 5. Predicted class for each held-out row.
pred_svm <- predict(svmfit, newdata = test_data)

# 6. Confusion matrix: predicted class (rows) against actual class (columns).
# NOTE(review): the confusion matrix recorded after this code shows every
# test row predicted as class 0, so this fit never identifies a high-quality
# wine. Class imbalance is the likely cause - confirm, and consider the
# class.weights argument of svm().
table(Predicted = pred_svm, Actual = test_data$class_quality)

##          Actual
## Predicted    0    1
##         0 1569  381
##         1    0    0

## Multi-class SVM classification (quality score as the class)
# Treat each quality score as its own class.
wine_data$quality <- as.factor(wine_data$quality)

# Reproducible 70/30 train/test split.
set.seed(123)
row_ids <- seq_len(nrow(wine_data))
sample_index <- sample(row_ids, 0.7 * nrow(wine_data))
train_data <- wine_data[sample_index, ]
test_data <- wine_data[-sample_index, ]

# Linear-kernel SVM on the quality classes. class_quality is removed from
# the predictors because it was computed from quality.
svmfit <- svm(
  quality ~ . - class_quality,
  data = train_data,
  kernel = "linear",
  cost = 10,
  scale = TRUE
)

# Predicted quality class for each held-out row.
pred_quality <- predict(svmfit, newdata = test_data)

# Confusion matrix: predicted class (rows) against actual class (columns).
table(Predicted = pred_quality, Actual = test_data$quality)

##          Actual
## Predicted   3   4   5   6   7   8   9
##         3   0   0   0   0   0   0   0
##         4   0   0   0   0   0   0   0
##         5   1  35 376 219  28   3   0
##         6   6  19 243 670 293  54   3
##         7   0   0   0   0   0   0   0
##         8   0   0   0   0   0   0   0
##         9   0   0   0   0   0   0   0

#Support Vector Machines (SVM)
library(e1071)

# 1. Convert wine type (if not already numeric or factor)
# 1. Make sure wine_type is a factor.
wine_data$wine_type <- as.factor(wine_data$wine_type)

# 2. Make sure quality is a factor so that svm() fits a classifier.
wine_data$quality <- as.factor(wine_data$quality)

# 3. Reproducible 70/30 train/test split.
set.seed(123)
train_idx <- sample(seq_len(nrow(wine_data)), 0.7 * nrow(wine_data))
train_data <- wine_data[train_idx, ]
test_data  <- wine_data[-train_idx, ]

# 4. Train radial SVM.
#    class_quality is computed directly from quality (quality >= 7), so it
#    must not be used as a predictor: leaving it in leaks the target into
#    the features and inflates the test accuracy. It is excluded here, as
#    the linear multi-class model already does.
# NOTE: the confusion matrix and accuracy recorded after this code in the
# document were produced with class_quality still among the predictors
# (predictions split exactly at the quality >= 7 boundary). Re-run to
# refresh them.
svm_rbf <- svm(quality ~ . - class_quality,
               data = train_data,
               kernel = "radial",      # non-linear kernel
               cost = 1,               # penalty for misclassification
               gamma = 0.1,            # controls the influence of a single training example
               scale = TRUE)

# 5. Predict on test data
pred_rbf <- predict(svm_rbf, newdata = test_data)

# 6. Evaluate performance: predicted class (rows) against actual (columns).
conf_mat <- table(Predicted = pred_rbf, Actual = test_data$quality)
print(conf_mat)

##          Actual
## Predicted   3   4   5   6   7   8   9
##         3   0   0   0   0   0   0   0
##         4   0   0   0   0   0   0   0
##         5   1  39 420 190   2   0   0
##         6   6  15 199 699   0   0   0
##         7   0   0   0   0 319  57   3
##         8   0   0   0   0   0   0   0
##         9   0   0   0   0   0   0   0

# Optional: Accuracy
# Overall accuracy: the diagonal of the confusion matrix holds the correctly
# classified test rows; divide by the total number of test rows.
n_correct <- sum(diag(conf_mat))
n_total <- sum(conf_mat)
accuracy <- n_correct / n_total
cat("Accuracy:", round(accuracy, 4), "\n")

## Accuracy: 0.7374

Stats Final Project

Brianna Stopher

2025-04-14