#This case study works through the OJ data set from the popular book Introduction to Statistical Learning (ISLR). The data contain 1070 purchases described by 18 variables, where each customer bought either Citrus Hill (CH) or Minute Maid (MM) orange juice.
# For the underlying theory, please refer to my blog post published on Medium: https://medium.com/analytics-vidhya/comprehensive-support-vector-machines-guide-using-illusion-to-solve-reality-ad3136d8f877


library(ISLR)
mydata=OJ
dim(mydata)
## [1] 1070   18
str(OJ)
## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...
table(mydata$Purchase)
## 
##  CH  MM 
## 653 417
anyNA(mydata)
## [1] FALSE
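#Optional quick look at one predictor (a sketch, not part of the original analysis):
#LoyalCH, the customer's loyalty to Citrus Hill, is an obvious candidate driver of
#Purchase, and a simple boxplot gives a first feel for how well it separates the classes
boxplot(LoyalCH ~ Purchase, data = mydata, ylab = "LoyalCH",
        main = "Brand loyalty by purchased juice")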
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
prop.table(table(mydata$Purchase))
## 
##        CH        MM 
## 0.6102804 0.3897196
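#Note: the classes are moderately imbalanced; always predicting CH would already
#score about 61% accuracy, so that is the baseline any SVM below should beat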
set.seed(1234)
Index=createDataPartition(mydata$Purchase, p=0.75, list = FALSE)
Train=mydata[Index,]
Test=mydata[-Index,]
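#Sanity check (optional sketch, output not shown): createDataPartition samples within
#each class, so the CH/MM proportions should be roughly preserved in both splits
prop.table(table(Train$Purchase))
prop.table(table(Test$Purchase))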

#Prepare the resampling scheme: 10-fold cross-validation, repeated once
#(with repeats = 1, "repeatedcv" is equivalent to plain 10-fold CV)
control=trainControl(method = "repeatedcv", number = 10, repeats = 1)

#Build an SVM model using a linear kernel
###Tune the cost parameter C for an optimized model
grid=expand.grid(C = seq(0.5,10,.5))
set.seed(1234)
SVM_L=train(Purchase~., data = Train,method = 'svmLinear',
            trControl = control,
            tuneGrid = grid,
            preProcess = c("scale","center"))

plot(SVM_L)

#Accuracy varies across the tested values of C

#The best value of C is retrieved below
SVM_L$bestTune
##    C
## 12 6
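#The fitted kernlab object sits in finalModel; printing it also reports the number
#of support vectors retained at the chosen C (a sketch, output not shown)
SVM_L$finalModel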
#Training Accuracy
a=SVM_L$results
TrainingAcc_L=a[which.max(a$Accuracy),"Accuracy"]
TrainingAcc_L
## [1] 0.8331327
##Predictions
Pred_L=predict(SVM_L, Test)
CM=confusionMatrix(Pred_L,Test$Purchase)
Acc_L=CM$overall[[1]]
Acc_L
## [1] 0.82397
#Visualize the confusion matrix
CM$table
##           Reference
## Prediction  CH  MM
##         CH 143  27
##         MM  20  77
fourfoldplot(CM$table)

Sensitivity_Linear=CM$byClass[[1]]
Sensitivity_Linear
## [1] 0.8773006
Specificity_Linear=CM$byClass[[2]]
Specificity_Linear
## [1] 0.7403846
###ROC and AUC

predictions.L=prediction(as.numeric(Pred_L),Test$Purchase)
Perf.L=performance(predictions.L,"tpr","fpr")
plot(Perf.L, main="ROC - SVM with Linear Kernel")

AUC=performance(predictions.L,"auc")
AUC_L=AUC@y.values[[1]]
AUC_L
## [1] 0.8088426
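#Note on the ROC above: it is built from hard class labels, so the curve has a single
#operating point and the AUC equals the average of sensitivity and specificity
#((0.8773 + 0.7404)/2 = 0.8088, exactly the value reported). A full ROC curve needs
#class probabilities. A minimal sketch, assuming the linear model is retrained with
#classProbs = TRUE (object names below are illustrative; output not shown):
control_prob=trainControl(method = "repeatedcv", number = 10, repeats = 1,
                          classProbs = TRUE, summaryFunction = twoClassSummary)
set.seed(1234)
SVM_L_prob=train(Purchase~., data = Train, method = 'svmLinear',
                 trControl = control_prob, tuneGrid = grid, metric = "ROC",
                 preProcess = c("scale","center"))
prob_MM=predict(SVM_L_prob, Test, type = "prob")[,"MM"]   #estimated P(Purchase == "MM")
pred_prob=prediction(prob_MM, Test$Purchase)
plot(performance(pred_prob,"tpr","fpr"), main="ROC - SVM Linear (probabilities)")
performance(pred_prob,"auc")@y.values[[1]]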
#Build the model using a polynomial kernel
#Tune both the cost C and the degree of the polynomial (the scale parameter is fixed at 1)
set.seed(3456555)
grid=expand.grid(C = c(0.1,1,3,5,7,9,10), 
                 degree=c(2,3),scale=1)
svm_P=train(Purchase~.,data=Train,method="svmPoly",
            trControl=control, tuneGrid=grid)

#Best parameter values after tuning:
svm_P$bestTune
##   degree scale C
## 5      2     1 3
#Visualize
plot(svm_P)

#Training Accuracy
b=svm_P$results
TrainingAcc_Poly=b[which.max(b$Accuracy),"Accuracy"]
TrainingAcc_Poly
## [1] 0.8170062
#Predictions
Pred_Poly=predict(svm_P,Test)
b=confusionMatrix(Pred_Poly,Test$Purchase)
Accuracy_Polynomial=b$overall[[1]]
Accuracy_Polynomial
## [1] 0.7865169
#Visualize the confusion matrix
b$table
##           Reference
## Prediction  CH  MM
##         CH 142  36
##         MM  21  68
fourfoldplot(b$table)

Sensitivity_Polynomial=b$byClass[[1]]
Sensitivity_Polynomial
## [1] 0.8711656
Specificity_Polynomial=b$byClass[[2]]
Specificity_Polynomial
## [1] 0.6538462
##ROC & AUC
predictions.P=prediction(as.numeric(Pred_Poly),labels=Test$Purchase)
Perf.P=performance(predictions.P,"tpr","fpr")
plot(Perf.P, main="ROC - SVM with Polynomial Kernel")

AUC=performance(predictions.P,"auc")
AUC_P=AUC@y.values[[1]]
AUC_P
## [1] 0.7625059
#Build the model using a radial (RBF) kernel
#Tune both the cost C and the sigma parameter of the radial kernel

grid=expand.grid(C = c(0.1,0.5,1,2,3,5,7.5,8,8.5,9,9.5,10), 
                 sigma=c(0.0025,0.005,0.01,0.015,0.02,0.025))
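#Optional: rather than hand-picking sigma values, kernlab's sigest() suggests a
#data-driven range (roughly the 10th, 50th and 90th percentile estimates of sigma)
#against which the grid above can be sanity-checked (a sketch, output not shown)
library(kernlab)
sigest(Purchase~., data = Train)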
set.seed(88888)
svm_Radial=train(Purchase~.,data=Train,method="svmRadial",
                 trControl=control, tuneGrid=grid)

#Best tuned parameters for the model, followed by a visualization of the tuning results
svm_Radial$bestTune
##    sigma C
## 45  0.01 8
plot(svm_Radial)

#Training Accuracy
c=svm_Radial$results
TrainingAcc_Rad=c[which.max(c$Accuracy),"Accuracy"]
TrainingAcc_Rad
## [1] 0.8331327
#Predictions
Pred_Radial=predict(svm_Radial,Test)
c=confusionMatrix(Pred_Radial,Test$Purchase)
Accuracy_Radial=c$overall[[1]]
Accuracy_Radial
## [1] 0.82397
#Visualize the confusion matrix
c$table
##           Reference
## Prediction  CH  MM
##         CH 147  31
##         MM  16  73
fourfoldplot(c$table)

Sensitivity_Radial=c$byClass[[1]]
Sensitivity_Radial
## [1] 0.9018405
Specificity_Radial=c$byClass[[2]]
Specificity_Radial
## [1] 0.7019231
#ROC & AUC
predictions.R=prediction(as.numeric(Pred_Radial),labels=Test$Purchase)
Perf.R=performance(predictions.R,"tpr","fpr")
plot(Perf.R, main="ROC - SVM with Radial Kernel")

AUC=performance(predictions.R,"auc")
AUC_R=AUC@y.values[[1]]
AUC_R
## [1] 0.8018818
##Comparison across kernels
Compare=data.frame(Kernel=c("Linear","Poly","Radial"),
                   Train_Acc=c(TrainingAcc_L,TrainingAcc_Poly,TrainingAcc_Rad),
                   Test_Acc=c(Acc_L,Accuracy_Polynomial,Accuracy_Radial),
                   Sensitivity=c(Sensitivity_Linear,Sensitivity_Polynomial,Sensitivity_Radial),
                   Specificity=c(Specificity_Linear,Specificity_Polynomial,Specificity_Radial),
                   AUC_All=c(AUC_L,AUC_P,AUC_R))
Compare
##   Kernel Train_Acc  Test_Acc Sensitivity Specificity   AUC_All
## 1 Linear 0.8331327 0.8239700   0.8773006   0.7403846 0.8088426
## 2   Poly 0.8170062 0.7865169   0.8711656   0.6538462 0.7625059
## 3 Radial 0.8331327 0.8239700   0.9018405   0.7019231 0.8018818
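#Optional follow-up (a sketch, output not shown): caret's resamples() collects the
#cross-validation results of several train() objects for a side-by-side summary and
#boxplots. The three models here were tuned with different seeds, hence on different
#folds, so treat this as an informal rather than a paired comparison.
resamp=resamples(list(Linear=SVM_L, Poly=svm_P, Radial=svm_Radial))
summary(resamp)
bwplot(resamp)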