library(e1071) # SVM analysis
library(ROCR)  # for creating ROC curves

1 Support vector classifier (SVC)

1.1 Generating Data

set.seed(1)
x = matrix(rnorm(20 * 2), ncol = 2)
y = c(rep(-1, 10), rep(1, 10))
x[y == 1, ] = x[y == 1, ] + 1 # shift class 1 observations so the classes separate
plot(x, col = (3 - y)) # plot the data, colored by class

1.2 SVC with cost = 10

dat = data.frame(x = x, y = as.factor(y))

svmfit1 = svm(y ~ .,
              data = dat,
              kernel = "linear", # support vector classifier
              cost = 10, # cost of a violation to the margin
              scale = FALSE) # do not scale features to zero mean / unit variance

1.2.1 Model performance

plot(svmfit1, dat) # plot support vector classifier

svmfit1$index # indices of the support vectors
## [1]  1  2  5  7 14 16 17
summary(svmfit1) # summary of the model
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.5 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
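
Because the kernel is linear and the features were left unscaled, the fitted hyperplane can be recovered directly from the model object. A minimal sketch (the beta and beta0 names are ours, not part of e1071):

# Recover the separating hyperplane f(x) = x %*% beta - beta0 from svmfit1
beta = drop(t(svmfit1$coefs) %*% svmfit1$SV) # slope coefficients
beta0 = svmfit1$rho # intercept
plot(x, col = (3 - y))
abline(beta0 / beta[2], -beta[1] / beta[2]) # decision boundary
abline((beta0 - 1) / beta[2], -beta[1] / beta[2], lty = 2) # margin
abline((beta0 + 1) / beta[2], -beta[1] / beta[2], lty = 2) # margin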

1.3 SVC with cost = 0.1

svmfit2 = svm(y ~ .,
              data = dat,
              kernel = "linear",
              cost = 0.1,
              scale = FALSE)

1.3.1 Model performance

plot(svmfit2, dat)

svmfit2$index
##  [1]  1  2  3  4  5  7  9 10 12 13 14 15 16 17 18 20
summary(svmfit2)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 0.1, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
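
Note the effect of cost: with the smaller value the margin is wider, so more observations lie on or inside it and become support vectors (16 here versus 7 with cost = 10). This can be checked directly:

length(svmfit1$index) # 7 support vectors at cost = 10
length(svmfit2$index) # 16 support vectors at cost = 0.1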

1.4 Ten-fold cross-validation on a set of models

tune.out = tune(svm, # ten-fold cross-validation over a set of candidate models
                y ~ .,
                data = dat,
                kernel = "linear",
                ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100))) # candidate cost values
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.05 
## 
## - Detailed performance results:
##    cost error dispersion
## 1 1e-03  0.65  0.3374743
## 2 1e-02  0.65  0.3374743
## 3 1e-01  0.05  0.1581139
## 4 1e+00  0.10  0.2108185
## 5 5e+00  0.15  0.2415229
## 6 1e+01  0.15  0.2415229
## 7 1e+02  0.15  0.2415229
bestmod = tune.out$best.model # extract the model with the lowest CV error
summary(bestmod) 
## 
## Call:
## best.tune(method = svm, train.x = y ~ ., data = dat, ranges = list(cost = c(0.001, 
##     0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
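
Beyond summary(), the cross-validation results can be inspected directly: e1071 supplies a plot method for tune objects, and the raw results are stored as a data frame. A quick sketch:

plot(tune.out) # CV error as a function of cost
tune.out$performances # data frame with cost, error, and dispersion columns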

1.5 Testing the model on a test set

xtest = matrix(rnorm(20 * 2), ncol = 2)
ytest = sample(c(-1, 1), 20, replace = TRUE)
xtest[ytest == 1, ] = xtest[ytest == 1, ] + 1
testdat = data.frame(x = xtest, y = as.factor(ytest)) # creating a test set
ypred = predict(bestmod, testdat) # predict using the best model
table(predict = ypred, truth = testdat$y) # confusion matrix
##        truth
## predict -1 1
##      -1  6 5
##      1   2 7
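
The overall test accuracy (13 of 20 correct in the table above) can also be computed in one line; a minimal sketch:

mean(ypred == testdat$y) # fraction of test observations classified correctly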

2 Support Vector Machine (SVM)

2.1 Creating data

rm(list = ls()) # clear the workspace
set.seed(1)
x = matrix(rnorm(200 * 2), ncol = 2)
x[1:100, ] = x[1:100, ] + 2
x[101:150, ] = x[101:150, ] - 2
y = c(rep(1, 150), rep(2, 50))
dat = data.frame(x = x, y = as.factor(y))
plot(x, col = y) # the two classes are not linearly separable

2.2 SVM using radial kernel with gamma = 1

train = sample(200, 100) # training set indices
svmfit = svm(y ~ .,
             data = dat[train, ],
             kernel = "radial", # radial kernel
             gamma = 1,
             cost = 1)

2.3 Model performance

plot(svmfit, dat[train, ])

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
##     gamma = 1, cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  1 
## 
## Number of Support Vectors:  37
## 
##  ( 17 20 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2
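
Before tuning, it can be instructive to check how this initial fit does on its own training data; a minimal sketch of a training confusion matrix:

table(true = dat[train, "y"],
      pred = predict(svmfit, dat[train, ]))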

2.4 Ten-fold cross-validation on a set of models

set.seed(1)
tune.out = tune(svm,
                y ~ .,
                data = dat[train, ],
                kernel = "radial",
                ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                              gamma = c(0.5, 1, 2, 3, 4)))
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     2
## 
## - best performance: 0.12 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-01   0.5  0.27 0.11595018
## 2  1e+00   0.5  0.13 0.08232726
## 3  1e+01   0.5  0.15 0.07071068
## 4  1e+02   0.5  0.17 0.08232726
## 5  1e+03   0.5  0.21 0.09944289
## 6  1e-01   1.0  0.25 0.13540064
## 7  1e+00   1.0  0.13 0.08232726
## 8  1e+01   1.0  0.16 0.06992059
## 9  1e+02   1.0  0.20 0.09428090
## 10 1e+03   1.0  0.20 0.08164966
## 11 1e-01   2.0  0.25 0.12692955
## 12 1e+00   2.0  0.12 0.09189366
## 13 1e+01   2.0  0.17 0.09486833
## 14 1e+02   2.0  0.19 0.09944289
## 15 1e+03   2.0  0.20 0.09428090
## 16 1e-01   3.0  0.27 0.11595018
## 17 1e+00   3.0  0.13 0.09486833
## 18 1e+01   3.0  0.18 0.10327956
## 19 1e+02   3.0  0.21 0.08755950
## 20 1e+03   3.0  0.22 0.10327956
## 21 1e-01   4.0  0.27 0.11595018
## 22 1e+00   4.0  0.15 0.10801234
## 23 1e+01   4.0  0.18 0.11352924
## 24 1e+02   4.0  0.21 0.08755950
## 25 1e+03   4.0  0.24 0.10749677
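
The CV-selected model can be visualized on the training data just as before; a quick sketch:

plot(tune.out$best.model, dat[train, ]) # decision boundary of the best model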

2.5 Testing on the test set

table(true = dat[-train, "y"],
      pred = predict(tune.out$best.model, newdata = dat[-train, ])) # predict.svm takes newdata, not newx
##     pred
## true  1  2
##    1 56 21
##    2 18  5
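
The implied test error rate follows directly from the confusion matrix; as a one-line sketch:

mean(predict(tune.out$best.model, newdata = dat[-train, ]) != dat[-train, "y"]) # test error rate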

3 ROC curve

# Function to plot an ROC curve from classifier scores (uses ROCR)
rocplot = function(pred, truth, ...) {
  predob = prediction(pred, truth) # ROCR prediction object
  perf = performance(predob, "tpr", "fpr") # true positive vs. false positive rate
  plot(perf, ...)
}
# Optimal model based on cross-validation
svmfit.opt = svm(y ~ .,
                 data = dat[train, ],
                 kernel = "radial",
                 gamma = 2,
                 cost = 1,
                 decision.values = TRUE) # return decision values along with predictions

# Increasing gamma produces a more flexible fit and further improves
# accuracy on the training data
svmfit.flex = svm(y ~ .,
                  data = dat[train, ],
                  kernel = "radial",
                  gamma = 50,
                  cost = 1,
                  decision.values = TRUE)

fitted1 = attributes(predict(svmfit.opt, dat[train, ],
                             decision.values = TRUE))$decision.values
fitted2 = attributes(predict(svmfit.flex, dat[train, ],
                             decision.values = TRUE))$decision.values
par(mfrow = c(1,2))

# ROC curve for the optimal model
rocplot(fitted1,
        dat[train, "y"],
        main = "Training Data")
# ROC curve for the more flexible model
rocplot(fitted2,
        dat[train, "y"],
        add = TRUE,
        col = "red")

fitted3 = attributes(predict(svmfit.opt, dat[-train, ],
                             decision.values = TRUE))$decision.values
fitted4 = attributes(predict(svmfit.flex, dat[-train, ],
                             decision.values = TRUE))$decision.values

rocplot(fitted3,
        dat[-train, "y"],
        main = "Test Data")

rocplot(fitted4,
        dat[-train, "y"],
        add = TRUE,
        col = "red")