## Homework 6: Support Vector Machines
# fetch the white-wine quality table (semicolon-delimited CSV with a header row)
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine_data <- read.csv(file = data_url, header = TRUE, sep = ";")
# show column types and the first few values of every column
# NOTE(review): glimpse() is not base R; presumably dplyr is attached
# somewhere outside this view -- confirm
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…
# count the NA entries in each column (named numeric vector, one per column)
vapply(wine_data, function(column) sum(is.na(column)), numeric(1))
## fixed.acidity volatile.acidity citric.acid
## 0 0 0
## residual.sugar chlorides free.sulfur.dioxide
## 0 0 0
## total.sulfur.dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
# treat quality as a categorical label so the model is fitted as a classifier
wine_data[["quality"]] <- as.factor(wine_data[["quality"]])
# re-inspect: quality should now be reported as a factor
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…
# fix the RNG state so the run is reproducible
set.seed(123)
# fit a linear-kernel SVM on every predictor, standardising the inputs
# NOTE(review): svm() is presumably e1071::svm, attached outside this view
svm_model <- svm(
  formula = quality ~ .,
  data = wine_data,
  kernel = "linear",
  scale = TRUE
)
# show the fitted model summary
print(svm_model)
##
## Call:
## svm(formula = quality ~ ., data = wine_data, kernel = "linear", scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 4553
# score the same rows the model was fitted on (training-set predictions)
predictions <- predict(svm_model, newdata = wine_data)
# cross-tabulate predicted labels (rows) against true labels (columns)
confusion_matrix <- with(
  wine_data,
  table(Predicted = predictions, Actual = quality)
)
confusion_matrix
## Actual
## Predicted 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 8 102 779 423 43 1 0
## 6 12 61 678 1775 837 174 5
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
# accuracy = correctly classified rows (the diagonal) over all rows
n_correct <- sum(diag(confusion_matrix))
n_total <- sum(confusion_matrix)
accuracy <- n_correct / n_total
accuracy
## [1] 0.5214373
# Create a regular grid over the observed alcohol and pH ranges, used for
# plotting decision boundaries in those two dimensions.
#
# Args:
#   data: data frame containing numeric `alcohol` and `pH` columns.
#   n:    number of grid points along each axis (default 75).
#
# Returns:
#   A data frame built by expand.grid() with n * n rows and the columns
#   `alcohol` and `pH`.
make.grid <- function(data, n = 75) {
  stopifnot(all(c("alcohol", "pH") %in% names(data)))
  # take the ranges from the `data` argument rather than a global data frame,
  # so the function works for whatever data it is given
  alcohol_range <- range(data[["alcohol"]], na.rm = TRUE)
  ph_range <- range(data[["pH"]], na.rm = TRUE)
  # length.out is spelled out in full instead of relying on partial matching
  x1 <- seq(from = alcohol_range[1], to = alcohol_range[2], length.out = n)
  x2 <- seq(from = ph_range[1], to = ph_range[2], length.out = n)
  expand.grid(alcohol = x1, pH = x2)
}
# build the alcohol x pH grid, then hold every other predictor at its mean
grid <- make.grid(wine_data)
# column means of all predictors (every column except the response)
predictor_names <- setdiff(names(wine_data), "quality")
predictor_means <- colMeans(wine_data[, predictor_names])
# predictors the grid does not vary become constant columns set to their mean
constant_vars <- setdiff(names(predictor_means), names(grid))
if (length(constant_vars) > 0) {
  grid[constant_vars] <- as.list(predictor_means[constant_vars])
}
# classify every grid point with the fitted model
grid$quality <- predict(svm_model, newdata = grid)
# plot the decision regions, the observations, and the support vectors
# rows of the training data that the fitted model kept as support vectors
# NOTE(review): relies on `index` holding support-vector row positions, as
# documented for e1071 svm objects -- confirm svm_model comes from e1071
support_vectors <- wine_data[svm_model$index, ]
ggplot() +
  # tiles go first so the translucent fill sits behind the points instead of
  # tinting their colours
  geom_tile(
    data = grid,
    aes(x = alcohol, y = pH, fill = quality),
    alpha = 0.3
  ) +
  geom_point(data = wine_data, aes(x = alcohol, y = pH, color = quality)) +
  # ring the support vectors so the plot shows what its title promises
  geom_point(
    data = support_vectors,
    aes(x = alcohol, y = pH),
    shape = 1,
    size = 2.5,
    color = "black",
    alpha = 0.4
  ) +
  ggtitle("SVM Decision Boundaries and Support Vectors") +
  theme_minimal()