## Homework 6: Support Vector Machines

Loading the Data

# load data: the file is semicolon-delimited, so the separator is overridden
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine_data <- read.csv(file = data_url, header = TRUE, sep = ";")

# inspect the data: one line per column showing its type and leading values
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity        <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity     <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid          <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar       <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides            <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide  <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density              <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH                   <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates            <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol              <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality              <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…

Data Preprocessing

# check for missing data: count the NA cells in every column
missing_mask <- is.na(wine_data)
colSums(missing_mask)
##        fixed.acidity     volatile.acidity          citric.acid 
##                    0                    0                    0 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##                    0                    0                    0 
## total.sulfur.dioxide              density                   pH 
##                    0                    0                    0 
##            sulphates              alcohol              quality 
##                    0                    0                    0
# treat the quality score as a categorical label rather than a number
wine_data[["quality"]] <- as.factor(wine_data[["quality"]])

# confirm the column type changed after preprocessing
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity        <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity     <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid          <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar       <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides            <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide  <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density              <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH                   <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates            <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol              <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…

Building the SVM Model

# fix the RNG seed so repeated runs start from the same random state
set.seed(123)

# build the SVM model: `quality ~ .` predicts quality from every other column
# of wine_data using a linear kernel. Because quality is a factor, the fit is
# reported as C-classification in the printed summary. scale = TRUE asks svm()
# to standardise the variables internally (presumably e1071::svm -- confirm
# which package is attached in the setup code).
svm_model <- svm(quality ~ ., data = wine_data, kernel = "linear", scale = TRUE)

# print the model summary (call, SVM type, kernel, cost, support-vector count)
print(svm_model)
## 
## Call:
## svm(formula = quality ~ ., data = wine_data, kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  4553

Model Evaluation

# in-sample predictions: the model is scored on the same rows it was fitted to
predictions <- predict(svm_model, newdata = wine_data)

# cross-tabulate predicted labels (rows) against actual labels (columns)
confusion_matrix <- table(Predicted = predictions, Actual = wine_data[["quality"]])
confusion_matrix
##          Actual
## Predicted    3    4    5    6    7    8    9
##         3    0    0    0    0    0    0    0
##         4    0    0    0    0    0    0    0
##         5    8  102  779  423   43    1    0
##         6   12   61  678 1775  837  174    5
##         7    0    0    0    0    0    0    0
##         8    0    0    0    0    0    0    0
##         9    0    0    0    0    0    0    0
# accuracy = correctly classified rows (the diagonal) over all classified rows
correct_count <- sum(diag(confusion_matrix))
total_count <- sum(confusion_matrix)
accuracy <- correct_count / total_count
accuracy
## [1] 0.5214373

Visualization

# create a grid of values for plotting decision boundaries
#
# Builds an evenly spaced n-by-n lattice that spans the observed range of two
# numeric columns of `data`.
#
# Arguments:
#   data   data frame containing the columns named by `x_var` and `y_var`
#   n      number of grid points along each axis (default 75)
#   x_var  name of the column used for the first axis (default "alcohol")
#   y_var  name of the column used for the second axis (default "pH")
#
# Returns a data frame with n^2 rows and two columns named after `x_var` and
# `y_var`, holding every combination of the two axis sequences.
make.grid <- function(data, n = 75, x_var = "alcohol", y_var = "pH") {
  stopifnot(
    is.data.frame(data),
    all(c(x_var, y_var) %in% names(data)),
    is.numeric(n), length(n) == 1L, n >= 1
  )

  # Ranges come from the `data` argument, not from a global object, so the
  # helper works for whatever data frame the caller passes in.
  axis_values <- function(column) {
    limits <- range(data[[column]], na.rm = TRUE)
    seq(from = limits[1], to = limits[2], length.out = n)
  }

  # A named list lets expand.grid() label the columns with the requested names.
  axes <- list(axis_values(x_var), axis_values(y_var))
  names(axes) <- c(x_var, y_var)
  expand.grid(axes)
}

# build the alcohol/pH lattice, then hold every remaining predictor at its mean
grid <- make.grid(wine_data)
predictor_means <- colMeans(wine_data[names(wine_data) != "quality"])
for (col_name in setdiff(names(predictor_means), names(grid))) {
  grid[[col_name]] <- predictor_means[[col_name]]
}

# label every grid cell with the class the fitted model predicts for it
grid$quality <- predict(svm_model, newdata = grid)

# plot the decision boundaries
# The tiles are drawn first so the translucent region shading sits behind the
# observations instead of being painted over them. The title names only what
# is shown: no layer highlights the support vectors.
ggplot() +
  geom_tile(data = grid, aes(x = alcohol, y = pH, fill = quality), alpha = 0.3) +
  geom_point(data = wine_data, aes(x = alcohol, y = pH, color = quality)) +
  ggtitle("SVM Decision Boundaries") +
  theme_minimal()