library(e1071) # SVM analysis
library(ROCR) # Creating ROC curveset.seed(1)
x = matrix(rnorm(20*2), ncol = 2)
y = c(rep(-1,10), rep(1 ,10))
x[y == 1, ] = x[ y== 1, ] + 1 # Separating observation by 1
plot(x, col = (3-y)) # ploting data ## SVS with cost = 10
library(e1071)
dat = data.frame(x = x, y = as.factor(y))
svmfit1 = svm(y ∼ .,
data = dat,
kernel = "linear", # support vector clasifier
cost = 10, # cost of a violation to the margin.
scale = FALSE) # not to scale each feature to have mean zero or standard deviation oneplot(svmfit1, dat) # plot support vector classifiersvmfit1$index # return support vector ## [1] 1 2 5 7 14 16 17
summary(svmfit1) # summary of the model##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
## gamma: 0.5
##
## Number of Support Vectors: 7
##
## ( 4 3 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
svmfit2 = svm(y∼.,
data=dat,
kernel ="linear",
cost =0.1,
scale =FALSE )plot(svmfit2, dat)svmfit2$index## [1] 1 2 3 4 5 7 9 10 12 13 14 15 16 17 18 20
summary(svmfit2)##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 0.1,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
## gamma: 0.5
##
## Number of Support Vectors: 16
##
## ( 8 8 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
tune.out = tune(svm, #ten-fold cross-validation on a set of models of interest.
y∼.,
data = dat,
kernel = "linear",
ranges = list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100))) # Different models
summary(tune.out)##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.05
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.65 0.3374743
## 2 1e-02 0.65 0.3374743
## 3 1e-01 0.05 0.1581139
## 4 1e+00 0.10 0.2108185
## 5 5e+00 0.15 0.2415229
## 6 1e+01 0.15 0.2415229
## 7 1e+02 0.15 0.2415229
bestmod = tune.out$best.model # Assigning the best model
summary(bestmod) ##
## Call:
## best.tune(method = svm, train.x = y ~ ., data = dat, ranges = list(cost = c(0.001,
## 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
## gamma: 0.5
##
## Number of Support Vectors: 16
##
## ( 8 8 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
xtest = matrix(rnorm(20*2), ncol = 2)
ytest = sample(c(-1,1), 20, rep=TRUE)
xtest[ytest == 1, ]= xtest[ytest== 1, ] + 1
testdat = data.frame(x=xtest, y=as.factor(ytest)) # Creating a testing set
ypred = predict(bestmod, testdat) # predicting using best model
table(predict = ypred, truth = testdat$y) # confusion matrix## truth
## predict -1 1
## -1 6 5
## 1 2 7
rm(list = ls()) # clean memory
set.seed (1)
x = matrix (rnorm(200*2), ncol = 2)
x[1:100, ] = x[1:100 , ] + 2
x[101:150, ] = x[101:150, ] - 2
y = c(rep(1 , 150), rep(2, 50))
dat = data.frame(x = x, y = as.factor(y))
plot(x, col=y)train = sample(200, 100) # traing set indexes
svmfit = svm(y∼.,
data = dat[train, ],
kernel = "radial", # for radial kernel
gamma = 1,
cost = 1)plot(svmfit , dat[train ,])summary (svmfit )##
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial",
## gamma = 1, cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 1
##
## Number of Support Vectors: 37
##
## ( 17 20 )
##
##
## Number of Classes: 2
##
## Levels:
## 1 2
set.seed(1)
tune.out = tune(svm,
y∼.,
data = dat[train, ],
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10, 100, 1000),
gamma = c(0.5, 1, 2, 3, 4)))
summary (tune.out)##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 2
##
## - best performance: 0.12
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.5 0.27 0.11595018
## 2 1e+00 0.5 0.13 0.08232726
## 3 1e+01 0.5 0.15 0.07071068
## 4 1e+02 0.5 0.17 0.08232726
## 5 1e+03 0.5 0.21 0.09944289
## 6 1e-01 1.0 0.25 0.13540064
## 7 1e+00 1.0 0.13 0.08232726
## 8 1e+01 1.0 0.16 0.06992059
## 9 1e+02 1.0 0.20 0.09428090
## 10 1e+03 1.0 0.20 0.08164966
## 11 1e-01 2.0 0.25 0.12692955
## 12 1e+00 2.0 0.12 0.09189366
## 13 1e+01 2.0 0.17 0.09486833
## 14 1e+02 2.0 0.19 0.09944289
## 15 1e+03 2.0 0.20 0.09428090
## 16 1e-01 3.0 0.27 0.11595018
## 17 1e+00 3.0 0.13 0.09486833
## 18 1e+01 3.0 0.18 0.10327956
## 19 1e+02 3.0 0.21 0.08755950
## 20 1e+03 3.0 0.22 0.10327956
## 21 1e-01 4.0 0.27 0.11595018
## 22 1e+00 4.0 0.15 0.10801234
## 23 1e+01 4.0 0.18 0.11352924
## 24 1e+02 4.0 0.21 0.08755950
## 25 1e+03 4.0 0.24 0.10749677
table(true = dat[-train, "y"],
pred = predict(tune.out$best.model, newx=dat[-train, ]))## pred
## true 1 2
## 1 56 21
## 2 18 5
# Creating function to plot ROC curve
rocplot = function(pred, truth, ...) {
predob = prediction(pred, truth)
perf = performance(predob, "tpr", "fpr")
plot(perf, ...)
}
# Optimal model based on cross-validation
svmfit.opt = svm(y ~.,
data = dat[train, ],
kernel = "radial",
gamma = 2,
cost = 1,
decision.values = T) # to obtain the fitted values for a given SVM model
# By increasing γ we can produce a more flexible
# fit and generate further improvements in accuracy
svmfit.flex = svm(y~.,
data = dat[train, ],
kernel = "radial",
gamma = 50,
cost = 1,
decision.values = T)
fitted1 = attributes(predict(svmfit.opt, dat[train, ],
decision.values = TRUE))$decision.values
fitted2 = attributes(predict(svmfit.flex, dat[train, ],
decision.values =T))$decision.values
par(mfrow = c(1,2))
# ROC plot for optimal model
rocplot(fitted1,
dat[train, "y"],
main = "Training Data")
# ROC model of flexible model
rocplot(fitted2,
dat[train, "y"],
add = T,
col = "red")
fitted3 = attributes(predict(svmfit.opt, dat[-train, ],
decision.values = T))$decision.values
fitted4 = attributes(predict(svmfit.flex, dat[-train, ],
decision.values =T))$decision.values
rocplot(fitted3,
dat[-train, "y"],
main = "Test Data")
rocplot(fitted4,
dat[-train, "y"],
add = T,
col = "red")