Data Engineering
# Data is contained in CSV files called "Wine Train", "Wine Test", and "Wine Types And Locations" which are locally stored
train = read.csv("C:/Users/gerde/OneDrive/Documents/DS_6306_Materials/Final Project/Wine Train.csv")
test = read.csv("C:/Users/gerde/OneDrive/Documents/DS_6306_Materials/Final Project/Wine Test Set.csv")
wine_types_locations <- readxl::read_excel("C:/Users/gerde/OneDrive/Documents/DS_6306_Materials/Final Project/Wine Types And Locations.xlsx")
winetrain = merge(train, wine_types_locations, by = "ID")
winetest = merge(test, wine_types_locations, by = "ID")
# Change the incorrect spelling of Califormia to California for uniformity
winetrain$location <- gsub("Califormia", "California", winetrain$location)
winetest$location <- gsub("Califormia", "California", winetest$location)
# Inspect data structure
str(winetrain)
## 'data.frame': 5463 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
## $ volatile.acidity : num 0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
## $ citric.acid : num 0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
## $ residual.sugar : num 12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
## $ chlorides : num 0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
## $ free.sulfur.dioxide : num 7 31 39 22 33 30 9 23 15 47 ...
## $ total.sulfur.dioxide: num 41 201 166 141 132 114 27 97 151 194 ...
## $ density : num 0.994 0.996 0.997 0.993 0.991 ...
## $ pH : num 3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
## $ sulphates : num 0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
## $ alcohol : num 11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
## $ quality : int 5 5 4 6 6 6 6 5 5 6 ...
## $ type : chr "white" "white" "white" "white" ...
## $ location : chr "Texas" "Texas" "Texas" "California" ...
str(winetest)
## 'data.frame': 1034 obs. of 14 variables:
## $ ID : int 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 ...
## $ fixed.acidity : num 6.6 7.2 8.9 6.7 7 7 7 5.7 6.6 7.8 ...
## $ volatile.acidity : num 0.84 0.54 0.565 0.13 0.57 0.17 0.18 0.36 0.19 0.43 ...
## $ citric.acid : num 0.03 0.27 0.34 0.32 0.02 0.31 0.49 0.34 0.28 0.49 ...
## $ residual.sugar : num 2.3 2.6 3 3.7 2 4.8 5.3 4.2 11.8 13 ...
## $ chlorides : num 0.059 0.084 0.093 0.017 0.072 0.034 0.04 0.026 0.042 0.033 ...
## $ free.sulfur.dioxide : num 32 12 16 32 17 34 34 21 54 37 ...
## $ total.sulfur.dioxide: num 48 78 112 99 26 132 125 77 137 158 ...
## $ density : num 0.995 0.996 1 0.993 0.996 ...
## $ pH : num 3.52 3.39 3.38 3.12 3.36 3.36 3.24 3.41 3.18 3.14 ...
## $ sulphates : num 0.56 0.71 0.61 0.44 0.61 0.48 0.4 0.45 0.37 0.35 ...
## $ alcohol : num 12.3 11 9.5 10 10.2 9.6 12.2 11.9 10.8 11.3 ...
## $ type : chr "red" "red" "red" "white" ...
## $ location : chr "California" "Texas" "Texas" "California" ...
# Preview top 5 rows from each to verify data
head(winetrain,5)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 1 7.2 0.34 0.34 12.6 0.048
## 2 2 6.0 0.27 0.28 4.8 0.063
## 3 3 6.9 0.26 0.49 1.6 0.058
## 4 4 6.6 0.25 0.34 3.0 0.054
## 5 5 7.1 0.17 0.43 1.3 0.023
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 7 41 0.99420 3.19 0.40 11.7
## 2 31 201 0.99640 3.69 0.71 10.0
## 3 39 166 0.99650 3.65 0.52 9.4
## 4 22 141 0.99338 3.26 0.47 10.4
## 5 33 132 0.99067 3.11 0.56 11.7
## quality type location
## 1 5 white Texas
## 2 5 white Texas
## 3 4 white Texas
## 4 6 white California
## 5 6 white California
head(winetest,5)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 5464 6.6 0.840 0.03 2.3 0.059
## 2 5465 7.2 0.540 0.27 2.6 0.084
## 3 5466 8.9 0.565 0.34 3.0 0.093
## 4 5467 6.7 0.130 0.32 3.7 0.017
## 5 5468 7.0 0.570 0.02 2.0 0.072
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol type
## 1 32 48 0.99520 3.52 0.56 12.3 red
## 2 12 78 0.99640 3.39 0.71 11.0 red
## 3 16 112 0.99980 3.38 0.61 9.5 red
## 4 32 99 0.99348 3.12 0.44 10.0 white
## 5 17 26 0.99575 3.36 0.61 10.2 red
## location
## 1 California
## 2 Texas
## 3 Texas
## 4 California
## 5 Texas
# Impute missing values for numeric columns with mean
winetrain = winetrain %>% mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
winetest = winetest %>% mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
# Handle missing 'type' with most frequent value
frequentMostTrain = names(sort(table(winetrain$type), decreasing = TRUE))[1]
frequentMostTest = names(sort(table(winetest$type), decreasing = TRUE))[1]
winetrain$type[is.na(winetrain$type)] = frequentMostTrain
winetest$type[is.na(winetest$type)] = frequentMostTest
# Remove difference in case for the "type" column
winetrain$type = tolower(winetrain$type)
winetest$type = tolower(winetest$type)
# Convert 'type' and 'location' to factors
winetrain$type = as.factor(winetrain$type)
winetrain$location = as.factor(winetrain$location)
winetest$type = as.factor(winetest$type)
winetest$location = as.factor(winetest$location)
Correlation Matrix
# Compute correlation matrix
correlation_matrix = cor(winetrain %>% select_if(is.numeric))
# Create a correlation plot using ggcorrplot
p = ggcorrplot(correlation_matrix,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method = "circle",
colors = c("tomato2", "white", "springgreen3"),
title = "Correlation Heatmap",
ggtheme = ggplot2::theme_minimal()) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert the ggplot object to an interactive plotly object
# First, melt the correlation matrix for easier data manipulation
melted_cormat = melt(correlation_matrix, na.rm = TRUE)
# Plot with plotly for interactivity
plotly_corr = plot_ly(
x = melted_cormat$Var1,
y = melted_cormat$Var2,
z = melted_cormat$value,
type = "heatmap",
colorscale = "RdBu",
showscale = TRUE,
hoverinfo = "text"
) %>%
layout(
xaxis = list(title = ""),
yaxis = list(title = ""),
title = "Feature Correlation Matrix",
font = list(size = 12)
) %>%
colorbar(title = "Correlation", len = 0.5)
# Display the interactive plot
plotly_corr
Model Selection
The models we chose to try out are a polynomial regression model and
a random forest model. Both models are commonly used for numerical data
due to their ability to determine linear relationships of data.
Regression Model
# Feature Engineering for regression model
winetrainE = winetrain %>%
mutate(
type_white = ifelse(type == "white", 1, 0),
type_unknown = ifelse(type == "unknown", 1, 0),
location_Texas = ifelse(location == "Texas", 1, 0),
alcohol_density = alcohol * density,
alcohol_sulphates = alcohol * sulphates,
alcohol_squared = alcohol^2,
density_squared = density^2,
log_residual_sugar = log(residual.sugar + 1),
log_chlorides = log(chlorides + 1)
)
# Apply same transformations to test set
winetestE = winetest %>%
mutate(
type_white = ifelse(type == "white", 1, 0),
type_unknown = ifelse(type == "unknown", 1, 0),
location_Texas = ifelse(location == "Texas", 1, 0),
alcohol_density = alcohol * density,
alcohol_sulphates = alcohol * sulphates,
alcohol_squared = alcohol^2,
density_squared = density^2,
log_residual_sugar = log(residual.sugar + 1),
log_chlorides = log(chlorides + 1)
)
# Split data into train/test sets
set.seed(44)
splitPerc = 0.75
trainIndices = sample(1:dim(winetrainE)[1], round(splitPerc * dim(winetrainE)[1]))
train_data = winetrainE[trainIndices, ]
test_data = winetrainE[-trainIndices, ]
# Fit linear model
fit = lm(quality ~ ., data = train_data)
summary(fit)
##
## Call:
## lm(formula = quality ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.00735 -0.36607 -0.04942 0.38108 2.93704
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.460e+03 1.738e+03 1.990 0.04663 *
## ID 3.157e-06 6.721e-06 0.470 0.63856
## fixed.acidity 8.562e-02 1.933e-02 4.430 9.65e-06 ***
## volatile.acidity -1.201e+00 9.504e-02 -12.636 < 2e-16 ***
## citric.acid -1.227e-02 9.411e-02 -0.130 0.89624
## residual.sugar 1.577e-02 1.124e-02 1.403 0.16075
## chlorides -2.425e+00 5.054e+00 -0.480 0.63137
## free.sulfur.dioxide 2.550e-03 9.048e-04 2.819 0.00484 **
## total.sulfur.dioxide -7.979e-04 3.697e-04 -2.158 0.03099 *
## density -6.939e+03 3.445e+03 -2.015 0.04401 *
## pH 6.051e-01 1.108e-01 5.463 4.97e-08 ***
## sulphates -4.472e-01 7.375e-01 -0.606 0.54436
## alcohol 1.030e+01 6.352e+00 1.622 0.10495
## typewhite -4.349e-01 6.487e-02 -6.705 2.29e-11 ***
## locationTexas -5.915e-01 2.374e-02 -24.918 < 2e-16 ***
## type_white NA NA NA NA
## type_unknown NA NA NA NA
## location_Texas NA NA NA NA
## alcohol_density -1.063e+01 6.237e+00 -1.705 0.08834 .
## alcohol_sulphates 1.026e-01 6.852e-02 1.497 0.13452
## alcohol_squared 1.426e-02 1.081e-02 1.318 0.18748
## density_squared 3.484e+03 1.707e+03 2.042 0.04125 *
## log_residual_sugar 2.744e-01 6.464e-02 4.246 2.23e-05 ***
## log_chlorides 2.049e+00 5.903e+00 0.347 0.72846
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6768 on 4076 degrees of freedom
## Multiple R-squared: 0.3929, Adjusted R-squared: 0.39
## F-statistic: 131.9 on 20 and 4076 DF, p-value: < 2.2e-16
# Predictions and MAE
predictions = predict(fit, newdata = test_data)
actual = test_data$quality
mae = mean(abs(actual - predictions))
cat("Polynomial Regression Mean Absolute Error on Training Data:", mae, "\n")
## Polynomial Regression Mean Absolute Error on Training Data: 0.4943121
# Predict on test data using Polynomial Regression
test_preds_poly = predict(fit, winetestE)
Random Forest Model
# Split features and target variable for training
x_train = winetrain %>% select(-quality)
y_train = winetrain$quality
# Train a Random Forest model
set.seed(44)
rf_model = randomForest(x = x_train, y = y_train,
ntree = 500, mtry = 4, importance = TRUE)
# Evaluate model on training data
train_preds = predict(rf_model, x_train)
mae_train = mae(as.numeric(y_train), as.numeric(train_preds))
cat("Random Forest Mean Absolute Error on Training Data:", mae_train, "\n")
## Random Forest Mean Absolute Error on Training Data: 0.1725617
# Predict on test data
test_preds = predict(rf_model, winetest)
Visualizations
# Enhanced relationship between alcohol and quality scatter plot
ggplot(train_data, aes(x = alcohol, y = quality)) +
geom_jitter(width = 1, height = 1, alpha = 0.6, size = 1, color = "maroon") +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed", linewidth = 1.5) +
scale_x_continuous(breaks = seq(8, 15, by = 1)) +
scale_y_continuous(breaks = seq(3, 9, by = 1)) +
labs(title = "Alcohol Content vs Wine Quality",
subtitle = "Each point represents a wine sample",
x = "Alcohol Content (%)",
y = "Quality Rating (1-10)",
caption = "Source: Wine Dataset") +
theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 10),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
plot.caption = element_text(size = 8, color = "gray50"),
panel.grid.minor = element_blank()
)
## `geom_smooth()` using formula = 'y ~ x'

# Stacked bar plot of wine type by location with custom colors
wine_counts <- winetrain %>%
group_by(location, type) %>%
summarise(count = n()) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
ggplot(wine_counts, aes(x = location, y = percentage, fill = type)) +
geom_bar(stat = "identity", position = "fill") +
geom_text(aes(label = scales::percent(percentage, accuracy = 1)),
position = position_stack(vjust = 0.5),
size = 3) +
scale_fill_manual(values = c("red" = "maroon", "white" = "white")) +
labs(title = "Distribution of Wine Type by Location",
x = "Location",
y = "Proportion",
fill = "Wine Type") +
theme_minimal() +
theme(panel.background = element_rect(fill = "gray"))

# Density plot of wine quality split by location
ggplot(winetrain, aes(x = quality, fill = location)) +
geom_density(alpha = 0.5) + # Semi-transparent fill for visibility
facet_wrap(~location, scales = "free_y") + # Create panels for each location
scale_fill_brewer(palette = "Set1") + # Use a color palette for distinction
labs(title = "Distribution of Wine Quality by Location",
subtitle = "Density of quality ratings among wines",
x = "Wine Quality Rating (1-10)",
y = "Density") +
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
axis.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 10),
strip.text = element_text(size = 12), # Size for facet labels
legend.position = "none", # Remove legend if redundant with facets
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#F0F0F0", color = NA),
axis.line = element_line(colour = "black")
)

# Enhanced bar plot of correlations including negative values
# Compute correlations with quality
correlations <- winetrain %>%
select(-type, -location) %>%
summarise(across(where(is.numeric), ~ cor(., winetrain$quality))) %>%
pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation")
# Sort by correlation
correlations <- correlations %>% arrange(Correlation)
ggplot(correlations, aes(x = reorder(Predictor, Correlation), y = Correlation)) +
geom_bar(stat = "identity",
aes(fill = ifelse(Correlation > 0, "#8B0000", "pink")),
color = "black", width = 0.7) +
geom_text(aes(label = round(Correlation, 2)),
vjust = ifelse(correlations$Correlation > 0, -0.5, 1.5),
hjust = ifelse(correlations$Correlation > 0, -0.2, 1.2),
size = 3, color = "black") +
coord_flip() +
scale_y_continuous(limits = c(min(correlations$Correlation) * 1.1, max(correlations$Correlation) * 1.1),
expand = c(0, 0)) +
labs(title = "Correlation of Predictors with Wine Quality",
subtitle = "Sorted by absolute strength of correlation",
x = "Predictors",
y = "Correlation Coefficient") +
scale_fill_identity(guide = "legend", labels = c("Positive","Negative")) +
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
axis.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 12),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#F0F0F0", color = NA),
axis.line = element_line(colour = "black"),
legend.title = element_blank(),
legend.position = "bottom",
plot.margin = unit(c(1, 1, 1, 1), "cm")
)
