Wine Quality Prediction

Data and Library setup

library(e1071)

## Warning: package 'e1071' was built under R version 4.1.3

# Seed the random number generator so that steps relying on random sampling
# give the same result on every run.
set.seed(123)

# Read the training and testing sets; the first row of each file holds the
# column names.
WQ_Train <- read.csv(
  '/Users/S-Wri/Documents/MSc Data Science/Data Analytics with R/Data Analytics using R/DAR Coursework/WineQuality_Training.txt',
  header = TRUE
)
WQ_Test <- read.csv(
  '/Users/S-Wri/Documents/MSc Data Science/Data Analytics with R/Data Analytics using R/DAR Coursework/WineQuality_Testing.txt',
  header = TRUE
)

# Recode quality as a factor: 1 where the label is 'Good', 0 otherwise.
WQ_Train$quality <- as.factor(ifelse(WQ_Train$quality == 'Good', 1, 0))
WQ_Test$quality <- as.factor(ifelse(WQ_Test$quality == 'Good', 1, 0))

Linear Kernel

Conduct a grid search to find the optimal parameter of the SVM using the linear kernel (cost=c(0.1,1,10)).

# Candidate values for the SVM cost parameter.
costs <- c(0.1, 1, 10)

# Grid search over cost with 5-fold cross-validation.
# The grid must be a *named* list passed as `ranges`: an unnamed vector is
# expanded into a column called Var1, which svm() silently ignores, so cost
# would never vary between candidates. The number of folds is a property of
# the tuning run and therefore goes through tune.control(); a bare `cross`
# argument would only be forwarded to svm().
tuned1 <- tune(svm,
               quality ~ .,
               data = WQ_Train,
               kernel = "linear",
               ranges = list(cost = costs),
               tunecontrol = tune.control(sampling = "cross", cross = 5))

# Refit at the selected cost. svm() takes a single cost value (it does not
# search over a vector); cross = 5 makes it report its own 5-fold CV accuracy.
svmfit1 <- svm(quality ~ .,
               data = WQ_Train,
               kernel = "linear",
               cost = tuned1$best.parameters$cost,
               cross = 5)

# NOTE: any printed results recorded in this document for svmfit1 / tuned1 come
# from an earlier run and need to be regenerated after this change.
summary(svmfit1)

## 
## Call:
## svm(formula = quality ~ ., data = WQ_Train, kernel = "linear", cost = costs, 
##     cross = 5)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 1 10 
## 
## Number of Support Vectors:  1445
## 
##  ( 721 724 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
## 
## 5-fold cross-validation on training data:
## 
## Total Accuracy: 75.36 
## Single Accuracies:
##  74.4 78.4 72.8 73.8 77.4

# Show the parameter combination that the tuning run stored as its best.
# NOTE(review): the recorded output below labels the column Var1 rather than
# cost - verify that it reflects a search that actually varied cost.
tuned1$best.parameters

##   Var1
## 1  0.1

Train an SVM classifier using the linear kernel and the corresponding optimal parameter, then make predictions on the testing dataset and report the predictive performance.

# Fit the linear-kernel classifier at cost = 0.1 on the training data.
svmfit2 <- svm(
  quality ~ .,
  data = WQ_Train,
  type = 'C-classification',
  kernel = 'linear',
  cost = 0.1
)

# Predict the class of every test observation.
svm_pred1 <- predict(svmfit2, newdata = WQ_Test)

# Cross-tabulate predicted (rows) against actual (columns) classes; the
# diagonal holds the correctly classified counts.
conf_mat1 <- table(svm_pred1, WQ_Test$quality)
accuracy <- sum(diag(conf_mat1)) / sum(conf_mat1)

conf_mat1

##          
## svm_pred1   0   1
##         0 172  86
##         1  40 202

print(sprintf("Accuracy: %s", round(accuracy, 3)))

## [1] "Accuracy: 0.748"

The predictive accuracy using the optimal cost parameter of 0.1 found in the grid search is 74.8%

RBF Kernel

Conduct a grid search to find the optimal parameters of the SVM using the RBF kernel (cost=c(0.1,1,10) and gamma=c(0.1,0.5,1.0)).

# Candidate values for the two RBF-kernel hyperparameters.
costs <- c(0.1, 1, 10)
gammas <- c(0.1, 0.5, 1.0)

# Joint grid search over every cost/gamma pair with 5-fold cross-validation.
# The grid must be a *named* list passed as `ranges`: an unnamed vector becomes
# a column called Var1 that svm() ignores, so nothing is actually tuned. Cost
# and gamma interact, so they are searched together rather than one at a time.
# The fold count is set through tune.control(); a bare `cross` argument would
# only be forwarded to svm().
tuned2 <- tune(svm,
               quality ~ .,
               data = WQ_Train,
               kernel = "radial",
               ranges = list(cost = costs, gamma = gammas),
               tunecontrol = tune.control(sampling = "cross", cross = 5))

# Retained so that existing references to tuned3 keep working; it now refers
# to the same joint search result as tuned2.
tuned3 <- tuned2

# Refit at the selected pair. svm() takes single values for cost and gamma;
# cross = 5 makes it report its own 5-fold CV accuracy.
svmfit3 <- svm(quality ~ .,
               data = WQ_Train,
               type = 'C-classification',
               kernel = 'radial',
               cost = tuned2$best.parameters$cost,
               gamma = tuned2$best.parameters$gamma,
               cross = 5)

# NOTE: any printed results recorded in this document for svmfit3 / tuned2 /
# tuned3 come from an earlier run and need to be regenerated after this change.
summary(svmfit3)

## 
## Call:
## svm(formula = quality ~ ., data = WQ_Train, type = "C-classification", 
##     kernel = "radial", gamma = gammas, cross = 5)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  1334
## 
##  ( 655 679 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
## 
## 5-fold cross-validation on training data:
## 
## Total Accuracy: 79.8 
## Single Accuracies:
##  78.8 78 80.8 81.6 79.8

# Show the best parameter combination stored by each RBF tuning run.
# NOTE(review): both recorded outputs below label the column Var1 rather than
# cost or gamma - verify that they reflect searches over those parameters.
tuned2$best.parameters

##   Var1
## 1  0.1

tuned3$best.parameters

##   Var1
## 1  0.1

Train an SVM classifier using the RBF kernel and the corresponding optimal parameters, then make predictions on the testing dataset and report the predictive performance.

# Fit the RBF-kernel classifier at cost = 0.1, gamma = 0.1 on the training data.
svmfit4 <- svm(formula = quality ~ .,
               data = WQ_Train,
               type = 'C-classification',
               kernel = 'radial',
               cost = 0.1,
               gamma = 0.1)

# Predict the class of every test observation.
svm_pred2 <- predict(svmfit4, WQ_Test)

# Predicted (rows) against actual (columns); the diagonal holds the correctly
# classified counts.
conf_mat2 <- table(svm_pred2, WQ_Test$quality)
accuracy2 <- sum(diag(conf_mat2)) / sum(conf_mat2)

conf_mat2

##          
## svm_pred2   0   1
##         0 174  76
##         1  38 212

# Report this model's own accuracy (accuracy2). Printing `accuracy` here would
# repeat the linear-kernel figure instead.
print(paste0("Accuracy: ", round(accuracy2, 3)))

## [1] "Accuracy: 0.772"

The confusion matrix above gives the RBF-kernel SVM with these parameters an accuracy of (174 + 212) / 500 = 0.772 on the testing dataset, meaning the classifier labels 77.2% of the test wines correctly.

Model Comparison

We will compare the predictive performance of the two different svm classifiers trained by using the linear and RBF kernels respectively by conducting a ROC curve analysis.

library(ggplot2)
library(pROC)

# Refit both classifiers with the hyperparameters used earlier in the report.
svm_lin <- svm(
  quality ~ .,
  data = WQ_Train,
  type = 'C-classification',
  kernel = 'linear',
  cost = 0.1
)

svm_rbf <- svm(
  quality ~ .,
  data = WQ_Train,
  type = 'C-classification',
  kernel = 'radial',
  cost = 0.1,
  gamma = 0.1
)

# Predicted classes on the test set, as factors.
svm_lin_pred <- predict(svm_lin, newdata = WQ_Test)
svm_rbf_pred <- predict(svm_rbf, newdata = WQ_Test)

# Turn the factor labels back into the numbers they spell out (0 / 1) by
# indexing the numeric level values with the factor itself.
svm_lin_pred <- as.numeric(levels(svm_lin_pred))[svm_lin_pred]
svm_rbf_pred <- as.numeric(levels(svm_rbf_pred))[svm_rbf_pred]

# Area under the ROC curve, actual class against predicted label.
# NOTE(review): the predictor here is a hard 0/1 label, so each ROC curve has
# only one interior operating point; consider scoring with decision values or
# class probabilities instead.
auc_lin <- auc(WQ_Test$quality, svm_lin_pred)
auc_rbf <- auc(WQ_Test$quality, svm_rbf_pred)

print(paste0('The area under the curve for the linear kernels is : ', auc_lin))

## [1] "The area under the curve for the linear kernels is : 0.756354821802935"

print(paste0('The area under the curve for the RBF kernels is : ', auc_rbf))

## [1] "The area under the curve for the RBF kernels is : 0.778432914046122"

The results of comparing the area under the curve (AUC) showed better predictive performance from the RBF kernel than from the linear kernel. The AUC for the linear kernel SVM was 0.76, while the AUC for the RBF kernel SVM was 0.78. On the testing dataset, the confusion matrices give an accuracy of 74.8% (374 of 500) for the linear-kernel classifier and 77.2% (386 of 500) for the RBF-kernel classifier.

Below is a ROC curve plot of the two SVM classifiers; it shows the higher performance of the RBF kernel compared with the linear one.

# Build the ROC objects (actual class against predicted label) for both models.
roc_lin <- roc(WQ_Test$quality, svm_lin_pred)
roc_rbf <- roc(WQ_Test$quality, svm_rbf_pred)

# pROC reports specificity (the true negative rate). The false positive rate
# on the x axis of a ROC plot is its complement, 1 - specificity.
df_lin <- data.frame(fpr = 1 - roc_lin$specificities,
                     tpr = roc_lin$sensitivities)
df_rbf <- data.frame(fpr = 1 - roc_rbf$specificities,
                     tpr = roc_rbf$sensitivities)

ggplot() +
  geom_line(data = df_lin, aes(x = fpr, y = tpr, color = "Linear")) +
  geom_line(data = df_rbf, aes(x = fpr, y = tpr, color = "RBF")) +
  labs(title = 'ROC Curve Comparison of SVM Classifiers with Linear & RBF Kernels',
       x = 'False Positive Rate (1 - Specificity)',
       y = 'True Positive Rate (Sensitivity)'
       ) +
  scale_color_manual(NULL, values = c('Linear' = 'lightblue', 'RBF' = 'red')) +
  # Apply the complete theme first: adding theme_light() after theme() would
  # replace the axis-title adjustments below.
  theme_light() +
  theme(axis.title.x = element_text(vjust = -2),
        axis.title.y = element_text(vjust = 3))

Wine Quality Prediction

C.Wright

Data and Library setup

Linear Kernel

RBF Kernel

Model Comparison