DDS 6306 Wine Quality Project

Team: Gerden Clark, Solomon Matthew, Vanessa Nkongolo

Date: December 14, 2024

Scope: To develop an effective model for predicting wine quality based on collected data of wine chemistry, contracted by Robert Renzoni Vineyards in Riverside California. The performance of the model is based upon mean absolute error (MAE), the lower this value the better performing the model is. The contractor has also requested an output of feature importance to the prediction model. Additionally, a challenge has been set forth to submit a linear regression model with the lowest possible MAE for competitive purposes, this model does not need to be the chosen model for the client.

Libraries

# Load necessary libraries
library(caret)
library(corrplot)
library(dplyr)
library(ggcorrplot)
library(GGally)
library(ggplot2)
library(glmnet)
library(Metrics)
library(modeldata)
library(plotly)
library(randomForest)
library(readxl)
library(reshape2)
library(tidyverse)

Data Engineering

# Data is contained in CSV files called "Wine Train", "Wine Test", and "Wine Types And Locations" which are locally stored

train = read.csv("C:/Users/gerde/OneDrive/Documents/DS_6306_Materials/Final Project/Wine Train.csv")
test = read.csv("C:/Users/gerde/OneDrive/Documents/DS_6306_Materials/Final Project/Wine Test Set.csv")
wine_types_locations <- readxl::read_excel("C:/Users/gerde/OneDrive/Documents/DS_6306_Materials/Final Project/Wine Types And Locations.xlsx")

winetrain = merge(train, wine_types_locations, by = "ID")
winetest = merge(test, wine_types_locations, by = "ID")

# Change the incorrect spelling of Califormia to California for uniformity
winetrain$location <- gsub("Califormia", "California", winetrain$location)
winetest$location <- gsub("Califormia", "California", winetest$location)

# Inspect data structure
str(winetrain)
## 'data.frame':    5463 obs. of  15 variables:
##  $ ID                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fixed.acidity       : num  7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
##  $ volatile.acidity    : num  0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
##  $ citric.acid         : num  0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
##  $ residual.sugar      : num  12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
##  $ chlorides           : num  0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
##  $ free.sulfur.dioxide : num  7 31 39 22 33 30 9 23 15 47 ...
##  $ total.sulfur.dioxide: num  41 201 166 141 132 114 27 97 151 194 ...
##  $ density             : num  0.994 0.996 0.997 0.993 0.991 ...
##  $ pH                  : num  3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
##  $ sulphates           : num  0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
##  $ alcohol             : num  11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
##  $ quality             : int  5 5 4 6 6 6 6 5 5 6 ...
##  $ type                : chr  "white" "white" "white" "white" ...
##  $ location            : chr  "Texas" "Texas" "Texas" "California" ...
str(winetest)
## 'data.frame':    1034 obs. of  14 variables:
##  $ ID                  : int  5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 ...
##  $ fixed.acidity       : num  6.6 7.2 8.9 6.7 7 7 7 5.7 6.6 7.8 ...
##  $ volatile.acidity    : num  0.84 0.54 0.565 0.13 0.57 0.17 0.18 0.36 0.19 0.43 ...
##  $ citric.acid         : num  0.03 0.27 0.34 0.32 0.02 0.31 0.49 0.34 0.28 0.49 ...
##  $ residual.sugar      : num  2.3 2.6 3 3.7 2 4.8 5.3 4.2 11.8 13 ...
##  $ chlorides           : num  0.059 0.084 0.093 0.017 0.072 0.034 0.04 0.026 0.042 0.033 ...
##  $ free.sulfur.dioxide : num  32 12 16 32 17 34 34 21 54 37 ...
##  $ total.sulfur.dioxide: num  48 78 112 99 26 132 125 77 137 158 ...
##  $ density             : num  0.995 0.996 1 0.993 0.996 ...
##  $ pH                  : num  3.52 3.39 3.38 3.12 3.36 3.36 3.24 3.41 3.18 3.14 ...
##  $ sulphates           : num  0.56 0.71 0.61 0.44 0.61 0.48 0.4 0.45 0.37 0.35 ...
##  $ alcohol             : num  12.3 11 9.5 10 10.2 9.6 12.2 11.9 10.8 11.3 ...
##  $ type                : chr  "red" "red" "red" "white" ...
##  $ location            : chr  "California" "Texas" "Texas" "California" ...
# Preview top 5 rows from each to verify data
head(winetrain,5)
##   ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1  1           7.2             0.34        0.34           12.6     0.048
## 2  2           6.0             0.27        0.28            4.8     0.063
## 3  3           6.9             0.26        0.49            1.6     0.058
## 4  4           6.6             0.25        0.34            3.0     0.054
## 5  5           7.1             0.17        0.43            1.3     0.023
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                   7                   41 0.99420 3.19      0.40    11.7
## 2                  31                  201 0.99640 3.69      0.71    10.0
## 3                  39                  166 0.99650 3.65      0.52     9.4
## 4                  22                  141 0.99338 3.26      0.47    10.4
## 5                  33                  132 0.99067 3.11      0.56    11.7
##   quality  type   location
## 1       5 white      Texas
## 2       5 white      Texas
## 3       4 white      Texas
## 4       6 white California
## 5       6 white California
head(winetest,5)
##     ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 5464           6.6            0.840        0.03            2.3     0.059
## 2 5465           7.2            0.540        0.27            2.6     0.084
## 3 5466           8.9            0.565        0.34            3.0     0.093
## 4 5467           6.7            0.130        0.32            3.7     0.017
## 5 5468           7.0            0.570        0.02            2.0     0.072
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol  type
## 1                  32                   48 0.99520 3.52      0.56    12.3   red
## 2                  12                   78 0.99640 3.39      0.71    11.0   red
## 3                  16                  112 0.99980 3.38      0.61     9.5   red
## 4                  32                   99 0.99348 3.12      0.44    10.0 white
## 5                  17                   26 0.99575 3.36      0.61    10.2   red
##     location
## 1 California
## 2      Texas
## 3      Texas
## 4 California
## 5      Texas
# Impute missing values for numeric columns with mean
winetrain = winetrain %>% mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
winetest = winetest %>% mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

# Handle missing 'type' with most frequent value
frequentMostTrain = names(sort(table(winetrain$type), decreasing = TRUE))[1]
frequentMostTest = names(sort(table(winetest$type), decreasing = TRUE))[1]
winetrain$type[is.na(winetrain$type)] = frequentMostTrain
winetest$type[is.na(winetest$type)] = frequentMostTest

# Remove difference in case for the "type" column
winetrain$type = tolower(winetrain$type)
winetest$type = tolower(winetest$type)

# Convert 'type' and 'location' to factors
winetrain$type = as.factor(winetrain$type)
winetrain$location = as.factor(winetrain$location)
winetest$type = as.factor(winetest$type)
winetest$location = as.factor(winetest$location)

Correlation Matrix

# Compute correlation matrix
correlation_matrix = cor(winetrain %>% select_if(is.numeric))

# Create a correlation plot using ggcorrplot
p = ggcorrplot(correlation_matrix, 
                hc.order = TRUE, 
                type = "lower", 
                lab = TRUE, 
                lab_size = 3, 
                method = "circle", 
                colors = c("tomato2", "white", "springgreen3"),
                title = "Correlation Heatmap",
                ggtheme = ggplot2::theme_minimal()) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert the ggplot object to an interactive plotly object

# First, melt the correlation matrix for easier data manipulation
melted_cormat = melt(correlation_matrix, na.rm = TRUE)

# Plot with plotly for interactivity
plotly_corr = plot_ly(
  x = melted_cormat$Var1,
  y = melted_cormat$Var2,
  z = melted_cormat$value,
  type = "heatmap",
  colorscale = "RdBu",
  showscale = TRUE,
  hoverinfo = "text"
) %>%
  layout(
    xaxis = list(title = ""),
    yaxis = list(title = ""),
    title = "Feature Correlation Matrix",
    font = list(size = 12)
  ) %>%
  colorbar(title = "Correlation", len = 0.5)

# Display the interactive plot
plotly_corr

Model Selection

The models we chose to try out are a polynomial regression model and a random forest model. Both models are commonly used for numerical data due to their ability to determine linear relationships of data.

Regression Model

# Feature Engineering for regression model
winetrainE = winetrain %>%
  mutate(
    type_white = ifelse(type == "white", 1, 0),
    type_unknown = ifelse(type == "unknown", 1, 0),
    location_Texas = ifelse(location == "Texas", 1, 0),
    alcohol_density = alcohol * density,
    alcohol_sulphates = alcohol * sulphates,
    alcohol_squared = alcohol^2,
    density_squared = density^2,
    log_residual_sugar = log(residual.sugar + 1),
    log_chlorides = log(chlorides + 1)
  )

# Apply same transformations to test set
winetestE = winetest %>%
  mutate(
    type_white = ifelse(type == "white", 1, 0),
    type_unknown = ifelse(type == "unknown", 1, 0),
    location_Texas = ifelse(location == "Texas", 1, 0),
    alcohol_density = alcohol * density,
    alcohol_sulphates = alcohol * sulphates,
    alcohol_squared = alcohol^2,
    density_squared = density^2,
    log_residual_sugar = log(residual.sugar + 1),
    log_chlorides = log(chlorides + 1)
  )

# Split data into train/test sets
set.seed(44)
splitPerc = 0.75
trainIndices = sample(1:dim(winetrainE)[1], round(splitPerc * dim(winetrainE)[1]))
train_data = winetrainE[trainIndices, ]
test_data = winetrainE[-trainIndices, ]

# Fit linear model
fit = lm(quality ~ ., data = train_data)
summary(fit)
## 
## Call:
## lm(formula = quality ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.00735 -0.36607 -0.04942  0.38108  2.93704 
## 
## Coefficients: (3 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.460e+03  1.738e+03   1.990  0.04663 *  
## ID                    3.157e-06  6.721e-06   0.470  0.63856    
## fixed.acidity         8.562e-02  1.933e-02   4.430 9.65e-06 ***
## volatile.acidity     -1.201e+00  9.504e-02 -12.636  < 2e-16 ***
## citric.acid          -1.227e-02  9.411e-02  -0.130  0.89624    
## residual.sugar        1.577e-02  1.124e-02   1.403  0.16075    
## chlorides            -2.425e+00  5.054e+00  -0.480  0.63137    
## free.sulfur.dioxide   2.550e-03  9.048e-04   2.819  0.00484 ** 
## total.sulfur.dioxide -7.979e-04  3.697e-04  -2.158  0.03099 *  
## density              -6.939e+03  3.445e+03  -2.015  0.04401 *  
## pH                    6.051e-01  1.108e-01   5.463 4.97e-08 ***
## sulphates            -4.472e-01  7.375e-01  -0.606  0.54436    
## alcohol               1.030e+01  6.352e+00   1.622  0.10495    
## typewhite            -4.349e-01  6.487e-02  -6.705 2.29e-11 ***
## locationTexas        -5.915e-01  2.374e-02 -24.918  < 2e-16 ***
## type_white                   NA         NA      NA       NA    
## type_unknown                 NA         NA      NA       NA    
## location_Texas               NA         NA      NA       NA    
## alcohol_density      -1.063e+01  6.237e+00  -1.705  0.08834 .  
## alcohol_sulphates     1.026e-01  6.852e-02   1.497  0.13452    
## alcohol_squared       1.426e-02  1.081e-02   1.318  0.18748    
## density_squared       3.484e+03  1.707e+03   2.042  0.04125 *  
## log_residual_sugar    2.744e-01  6.464e-02   4.246 2.23e-05 ***
## log_chlorides         2.049e+00  5.903e+00   0.347  0.72846    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6768 on 4076 degrees of freedom
## Multiple R-squared:  0.3929, Adjusted R-squared:   0.39 
## F-statistic: 131.9 on 20 and 4076 DF,  p-value: < 2.2e-16
# Predictions and MAE
predictions = predict(fit, newdata = test_data)
actual = test_data$quality
mae = mean(abs(actual - predictions))
cat("Polynomial Regression Mean Absolute Error on Training Data:", mae, "\n")
## Polynomial Regression Mean Absolute Error on Training Data: 0.4943121
# Predict on test data using Polynomial Regression
test_preds_poly = predict(fit, winetestE)

Random Forest Model

# Split features and target variable for training
x_train = winetrain %>% select(-quality)
y_train = winetrain$quality

# Train a Random Forest model
set.seed(44)
rf_model = randomForest(x = x_train, y = y_train, 
                        ntree = 500, mtry = 4, importance = TRUE)

# Evaluate model on training data
train_preds = predict(rf_model, x_train)
mae_train = mae(as.numeric(y_train), as.numeric(train_preds)) 
cat("Random Forest Mean Absolute Error on Training Data:", mae_train, "\n")
## Random Forest Mean Absolute Error on Training Data: 0.1725617
# Predict on test data
test_preds = predict(rf_model, winetest)

Output Predictions

# Display mean of the predicted wine qualities for each model
mean(test_preds)
## [1] 5.801964
mean(test_preds_poly)
## [1] 5.822865
# Save predictions
test_results = test
test_results$Random_Forest_Predictions = test_preds
test_results$Polynomial_Linear_Regression_Predictions = test_preds_poly
tr_trimmed = test_results %>% select(ID,Random_Forest_Predictions,Polynomial_Linear_Regression_Predictions)

# Verify data is properly structured
str(tr_trimmed)
## 'data.frame':    1034 obs. of  3 variables:
##  $ ID                                      : int  5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 ...
##  $ Random_Forest_Predictions               : num  6.56 5.34 5.11 6.18 5.44 ...
##  $ Polynomial_Linear_Regression_Predictions: num  5.88 5.38 5.05 6.07 5.26 ...
# Write data to CSV file
write.csv(tr_trimmed, "C:/Users/gerde/OneDrive/Documents/DS_6306_Materials/Final Project/New_Wine_Test_Predictions.csv", row.names = FALSE)

Visualizations

# Enhanced relationship between alcohol and quality scatter plot
ggplot(train_data, aes(x = alcohol, y = quality)) +
  geom_jitter(width = 1, height = 1, alpha = 0.6, size = 1, color = "maroon") +  
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed", linewidth = 1.5) +  
  scale_x_continuous(breaks = seq(8, 15, by = 1)) +  
  scale_y_continuous(breaks = seq(3, 9, by = 1)) +  
  labs(title = "Alcohol Content vs Wine Quality",
       subtitle = "Each point represents a wine sample",
       x = "Alcohol Content (%)", 
       y = "Quality Rating (1-10)",
       caption = "Source: Wine Dataset") +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.caption = element_text(size = 8, color = "gray50"),
    panel.grid.minor = element_blank()  
  )
## `geom_smooth()` using formula = 'y ~ x'

# Stacked bar plot of wine type by location with custom colors
wine_counts <- winetrain %>%
  group_by(location, type) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
ggplot(wine_counts, aes(x = location, y = percentage, fill = type)) +
  geom_bar(stat = "identity", position = "fill") +
  geom_text(aes(label = scales::percent(percentage, accuracy = 1)),
            position = position_stack(vjust = 0.5),
            size = 3) +
  scale_fill_manual(values = c("red" = "maroon", "white" = "white")) +
  labs(title = "Distribution of Wine Type by Location",
       x = "Location",
       y = "Proportion",
       fill = "Wine Type") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "gray"))

# Density plot of wine quality split by location
ggplot(winetrain, aes(x = quality, fill = location)) +
  geom_density(alpha = 0.5) +  # Semi-transparent fill for visibility
  facet_wrap(~location, scales = "free_y") +  # Create panels for each location
  scale_fill_brewer(palette = "Set1") +  # Use a color palette for distinction
  labs(title = "Distribution of Wine Quality by Location",
       subtitle = "Density of quality ratings among wines",
       x = "Wine Quality Rating (1-10)",
       y = "Density") +
  
  theme(
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12),  # Size for facet labels
    legend.position = "none",  # Remove legend if redundant with facets
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#F0F0F0", color = NA),
    axis.line = element_line(colour = "black")
  )

# Enhanced bar plot of correlations including negative values

# Compute correlations with quality
correlations <- winetrain %>%
  select(-type, -location) %>%
  summarise(across(where(is.numeric), ~ cor(., winetrain$quality))) %>%
  pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation")

# Sort by correlation
correlations <- correlations %>% arrange(Correlation)


ggplot(correlations, aes(x = reorder(Predictor, Correlation), y = Correlation)) +
  geom_bar(stat = "identity", 
           aes(fill = ifelse(Correlation > 0, "#8B0000", "pink")), 
           color = "black", width = 0.7) +  
  geom_text(aes(label = round(Correlation, 2)), 
            vjust = ifelse(correlations$Correlation > 0, -0.5, 1.5), 
            hjust = ifelse(correlations$Correlation > 0, -0.2, 1.2), 
            size = 3, color = "black") +  
  coord_flip() +
  scale_y_continuous(limits = c(min(correlations$Correlation) * 1.1, max(correlations$Correlation) * 1.1), 
                     expand = c(0, 0)) +  
  labs(title = "Correlation of Predictors with Wine Quality",
       subtitle = "Sorted by absolute strength of correlation",
       x = "Predictors",
       y = "Correlation Coefficient") +
  scale_fill_identity(guide = "legend", labels = c("Positive","Negative")) +  
  
  theme(
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#F0F0F0", color = NA),
    axis.line = element_line(colour = "black"),
    legend.title = element_blank(),
    legend.position = "bottom",
    plot.margin = unit(c(1, 1, 1, 1), "cm")
  )

Conclusions

Based on the MAE values obtained from the random forest model and polynomial regression model, the best overall model based upon MAE is the random forest model.

Alcohol is the most significant contributing feature used for predicting quality of wine, higher alcohol content equates to higher quality.

Despite the difference in the Texas vs. California wine quality density plots, this feature had no significance in the models.