Try SVM

Generate a toy dataset.

library('e1071')

set.seed(102)
x = matrix(rnorm(20),10,2)
y = rep(c(-1,1),c(5,5))
x[y==1,] = x[y==1,]+2

plot(x,col=y+3)

Define a re-usable linear svm function and try on the toy data set.

makegrid <- function(x,n=75){
    grange = apply(x,2,range)
    x1_g = seq(from=grange[1,1],to=grange[2,1],length.out = n)
    x2_g = seq(from=grange[1,2],to=grange[2,2],length.out = n)
    expand.grid(x1=x1_g,x2=x2_g)
}

train_linear_svm <- function(x,y,...){
    #x: data.frame with columns x1,x2...
    #y: outcome labels in (-1,1)
    
    svmfit = svm(x,y,type='C',kernel='linear',...)
    #print(svmfit)
    #plot(svmfit,training)
    
    xgrid = makegrid(x)
    ygrid = predict(svmfit,xgrid)
    beta = t(svmfit$coefs)%*%as.matrix(x[svmfit$index,])
    beta0 = svmfit$rho    #negative intercept
    
    plot(xgrid,col=c('red','blue')[as.numeric(ygrid)],pch=20,cex=.2)
    points(x,col=y+3,pch=20)
    points(svmfit$SV,pch=5,cex=2)
    abline(beta0/beta[2],-beta[1]/beta[2])
    abline((beta0-1)/beta[2],-beta[1]/beta[2])
    abline((beta0+1)/beta[2],-beta[1]/beta[2])
    
    return(svmfit)
}

invisible(train_linear_svm(x,y,scale=F,cost = .2))

Now reproduce figure 12.2 in ESL.

#== data from ESL ==#
setwd('/Users/Nan/Dropbox/R/practice/svm/')

load('ESL.mixture.rda')
str(ESL.mixture)

## List of 8
##  $ x       : num [1:200, 1:2] 2.5261 0.367 0.7682 0.6934 -0.0198 ...
##  $ y       : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
##  $ xnew    : matrix [1:6831, 1:2] -2.6 -2.5 -2.4 -2.3 -2.2 -2.1 -2 -1.9 -1.8 -1.7 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6831] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:2] "x1" "x2"
##  $ prob    : atomic [1:6831] 3.55e-05 3.05e-05 2.63e-05 2.27e-05 1.96e-05 ...
##   ..- attr(*, ".Names")= chr [1:6831] "1" "2" "3" "4" ...
##  $ marginal: atomic [1:6831] 6.65e-15 2.31e-14 7.62e-14 2.39e-13 7.15e-13 ...
##   ..- attr(*, ".Names")= chr [1:6831] "1" "2" "3" "4" ...
##  $ px1     : num [1:69] -2.6 -2.5 -2.4 -2.3 -2.2 -2.1 -2 -1.9 -1.8 -1.7 ...
##  $ px2     : num [1:99] -2 -1.95 -1.9 -1.85 -1.8 -1.75 -1.7 -1.65 -1.6 -1.55 ...
##  $ means   : num [1:20, 1:2] -0.2534 0.2667 2.0965 -0.0613 2.7035 ...

training = as.data.frame(ESL.mixture$x)
names(training) = c('x1','x2')

y_train = ESL.mixture$y
y_train[y_train==0] = -1

When Cost=10000:

svmfit = train_linear_svm(training,y_train,scale=F,cost = 10000)
#bayes decision boundary
contour(ESL.mixture$px1,ESL.mixture$px2,matrix(ESL.mixture$prob,ncol=length(ESL.mixture$px2)),
        levels=0.5,drawlabels = F,add=T,lty=2,lwd=2,col='purple')

names(svmfit)

##  [1] "call"            "type"            "kernel"         
##  [4] "cost"            "degree"          "gamma"          
##  [7] "coef0"           "nu"              "epsilon"        
## [10] "sparse"          "scaled"          "x.scale"        
## [13] "y.scale"         "nclasses"        "levels"         
## [16] "tot.nSV"         "nSV"             "labels"         
## [19] "SV"              "index"           "rho"            
## [22] "compprob"        "probA"           "probB"          
## [25] "sigma"           "coefs"           "na.action"      
## [28] "fitted"          "decision.values"

ct = table(y_train,svmfit$fitted)
(ct[2]+ct[3])/sum(ct)

## [1] 0.27

prob = ESL.mixture$prob
(bayes.error<-sum(ESL.mixture$marginal*(prob*I(prob<0.5)+(1-prob)*I(prob>=.5))) )

## [1] 0.2101192

pred_y = predict(svmfit,ESL.mixture$xnew)
pred_y = as.integer(levels(pred_y)[pred_y])
(test.error<-sum(ESL.mixture$marginal*(prob*I(pred_y<0)+(1-prob)*I(pred_y>=0))) )

## [1] 0.2877039

So Bayes error is 0.2101192, training error 0.27 and test error 0.2877039. When Cost=.01:

svmfit = train_linear_svm(training,y_train,scale = F,cost = .01)
#bayes decision boundary
contour(ESL.mixture$px1,ESL.mixture$px2,matrix(ESL.mixture$prob,ncol=length(ESL.mixture$px2)),
        levels=0.5,drawlabels = F,add=T,lty=2,lwd=2,col='purple')

ct = table(y_train,svmfit$fitted)
(ct[2]+ct[3])/sum(ct)

## [1] 0.26

pred_y = predict(svmfit,ESL.mixture$xnew)
pred_y = as.integer(levels(pred_y)[pred_y])
(test.error<-sum(ESL.mixture$marginal*(prob*I(pred_y<0)+(1-prob)*I(pred_y>=0))) )

## [1] 0.2999182

So training error is 0.26 and test error 0.2999182.

Now reproduce figure 12.4 in ESL.

We need to find the parameters for optimal decision boundary using cross-validation. For the RBF kernel:

set.seed(123)
tuneout =  tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
                ranges = list(cost=c(0.1,.2,.5,1,2,5,10,20,50,100)))
tuneout

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     2
## 
## - best performance: 0.5935475

tuneout =  tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
                ranges = list(cost=seq(1,4,length.out = 20)))
tuneout

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##      cost
##  1.631579
## 
## - best performance: 0.6176497

tuneout =  tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
                ranges = list(cost=seq(1,2.5,length.out = 20)))
tuneout

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##      cost
##  2.421053
## 
## - best performance: 0.6201025

tuneout =  tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
                ranges = list(cost=seq(1.6,2.4,length.out = 20)))
tuneout

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##      cost
##  2.189474
## 
## - best performance: 0.6220007

Then we use a cost = 2.19.

svmfit = svm(training,y_train,type='C',kernel='radial',scale = F,cost = 2.19,gamma = 1)
ct = table(y_train,svmfit$fitted)
(ct[2]+ct[3])/sum(ct)

## [1] 0.16

xgrid = makegrid(training)
ygrid = predict(svmfit,xgrid)

plot(xgrid,col=c('red','blue')[as.numeric(ygrid)],pch=20,cex=.2)
points(training,col=y_train+3,pch=20)

#decision boundary grid
xcont = makegrid(training,n = 200)
xcont = xcont[order(xcont$x1,xcont$x2),]
ycont = predict(svmfit,xcont,decision.values = T)

#decision boundary
contour(unique(xcont$x1),unique(xcont$x2),
        t(matrix(attr(ycont,'decision.values'),ncol=length(unique(xcont$x2)))),
        levels=0,drawlabels = F,add=T,lty=1,lwd=2,col='black')

#hyperplane for class -1
contour(unique(xcont$x1),unique(xcont$x2),
        t(matrix(attr(ycont,'decision.values'),ncol=length(unique(xcont$x2)))),
        levels=-1,drawlabels = F,add=T,lty=1,lwd=1,col='blue')

#hyperplane for class 1
contour(unique(xcont$x1),unique(xcont$x2),
        t(matrix(attr(ycont,'decision.values'),ncol=length(unique(xcont$x2)))),
        levels=1,drawlabels = F,add=T,lty=1,lwd=1,col='red')

pred_y = predict(svmfit,ESL.mixture$xnew)
pred_y = as.integer(levels(pred_y)[pred_y])
(test.error<-sum(ESL.mixture$marginal*(prob*I(pred_y<0)+(1-prob)*I(pred_y>=0))) )

## [1] 0.2185895

So training error is 0.16 and test error 0.2185895.

Now manually predict the fitted decision values for the training data and compare the result.

rbf <- function(sv,newdata,coef,gamma=1){
    sv = t(sv)
    rbf1 <- function(x){    #for a single vector x
        diff = apply(sv-x,2,function(x) sum(x^2))
        sum(exp(-gamma*diff)*coef)
    }
    
    apply(newdata,1,rbf1)
}

beta0 = svmfit$rho    #negative intercept

my_pred = rbf(svmfit$SV,training,svmfit$coefs)-beta0

plot(my_pred,svmfit$decision.values)
abline(0,1,col='red',lwd=2)

Try SVM

Nan Wang

7/8/2015

Now reproduce figure 12.2 in ESL.

Now reproduce figure 12.4 in ESL.